Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions backends/hip-gen/ceed-hip-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,20 @@
static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
Ceed ceed;
CeedOperator_Hip_gen *impl;
bool is_composite;

CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedCallBackend(CeedOperatorGetData(op, &impl));
CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
if (is_composite) {
CeedInt num_suboperators;

CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
for (CeedInt i = 0; i < num_suboperators; i++) {
if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
Comment thread
jeremylt marked this conversation as resolved.
impl->streams[i] = NULL;
}
}
if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
CeedCallBackend(CeedFree(&impl));
Expand Down Expand Up @@ -239,28 +250,35 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
}

static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
bool is_run_good[CEED_COMPOSITE_MAX] = {false};
CeedInt num_suboperators;
const CeedScalar *input_arr = NULL;
CeedScalar *output_arr = NULL;
Ceed ceed;
CeedOperator *sub_operators;
bool is_run_good[CEED_COMPOSITE_MAX] = {true};
CeedInt num_suboperators;
const CeedScalar *input_arr = NULL;
CeedScalar *output_arr;
Ceed ceed;
CeedOperator_Hip_gen *impl;
CeedOperator *sub_operators;

CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
CeedCallBackend(CeedOperatorGetData(op, &impl));
CeedCallBackend(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
CeedCallBackend(CeedCompositeOperatorGetSubList(op, &sub_operators));
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
for (CeedInt i = 0; i < num_suboperators; i++) {
CeedInt num_elem = 0;

CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
if (num_elem > 0) {
hipStream_t stream = NULL;
if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i]));
CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request));
} else {
is_run_good[i] = true;
}
}

CeedCallHip(ceed, hipStreamCreate(&stream));
CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
CeedCallHip(ceed, hipStreamDestroy(stream));
for (CeedInt i = 0; i < num_suboperators; i++) {
if (impl->streams[i]) {
if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
}
}
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
Expand Down
1 change: 1 addition & 0 deletions backends/hip-gen/ceed-hip-gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ typedef struct {
CeedInt Q, Q_1d;
CeedInt max_P_1d;
CeedInt thread_1d;
hipStream_t streams[CEED_COMPOSITE_MAX];
hipModule_t module;
hipFunction_t op;
FieldsInt_Hip indices;
Expand Down
45 changes: 33 additions & 12 deletions backends/hip-ref/ceed-hip-ref-vector.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,19 +305,21 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
// Set value for synced device/host array
if (impl->d_array) {
CeedScalar *copy_array;
Ceed ceed;

CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
#if (HIP_VERSION >= 60000000)
hipblasHandle_t handle;
Ceed ceed;

CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
hipStream_t stream;
CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
#if defined(CEED_SCALAR_IS_FP32)
CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
#else /* CEED_SCALAR */
CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
#endif /* CEED_SCALAR */
CeedCallHip(ceed, hipStreamSynchronize(stream));
#else /* HIP_VERSION */
CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array));
#endif /* HIP_VERSION */
Expand Down Expand Up @@ -557,14 +559,15 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
const CeedScalar *d_array;
CeedVector_Hip *impl;
hipblasHandle_t handle;
hipStream_t stream;
Ceed_Hip *hip_data;

CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
CeedCallBackend(CeedGetData(ceed, &hip_data));
CeedCallBackend(CeedVectorGetData(vec, &impl));
CeedCallBackend(CeedVectorGetLength(vec, &length));
CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));

CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
#if (HIP_VERSION < 60000000)
// With ROCm 6, we can use the 64-bit integer interface. Prior to that,
// we need to check if the vector is too long to handle with int32,
Expand All @@ -581,6 +584,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
#if defined(CEED_SCALAR_IS_FP32)
#if (HIP_VERSION >= 60000000) // We have ROCm 6, and can use 64-bit integers
CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
#else /* HIP_VERSION */
float sub_norm = 0.0;
float *d_array_start;
Expand All @@ -591,12 +595,14 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;

CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
*norm += sub_norm;
}
#endif /* HIP_VERSION */
#else /* CEED_SCALAR */
#if (HIP_VERSION >= 60000000)
CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
#else /* HIP_VERSION */
double sub_norm = 0.0;
double *d_array_start;
Expand All @@ -607,6 +613,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;

CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
*norm += sub_norm;
}
#endif /* HIP_VERSION */
Expand All @@ -617,6 +624,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
#if defined(CEED_SCALAR_IS_FP32)
#if (HIP_VERSION >= 60000000)
CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
#else /* HIP_VERSION */
float sub_norm = 0.0, norm_sum = 0.0;
float *d_array_start;
Expand All @@ -627,13 +635,15 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;

CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
norm_sum += sub_norm * sub_norm;
}
*norm = sqrt(norm_sum);
#endif /* HIP_VERSION */
#else /* CEED_SCALAR */
#if (HIP_VERSION >= 60000000)
CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
#else /* HIP_VERSION */
double sub_norm = 0.0, norm_sum = 0.0;
double *d_array_start;
Expand All @@ -644,6 +654,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;

CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
CeedCallHip(ceed, hipStreamSynchronize(stream));
norm_sum += sub_norm * sub_norm;
}
*norm = sqrt(norm_sum);
Expand All @@ -658,7 +669,8 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
CeedScalar norm_no_abs;

CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
CeedCallHip(ceed, hipStreamSynchronize(stream));
*norm = fabs(norm_no_abs);
#else /* HIP_VERSION */
CeedInt index;
Expand All @@ -672,10 +684,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor

CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
if (hip_data->has_unified_addressing) {
CeedCallHip(ceed, hipDeviceSynchronize());
CeedCallHip(ceed, hipStreamSynchronize(stream));
sub_max = fabs(d_array[index - 1]);
} else {
CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
CeedCallHip(ceed, hipStreamSynchronize(stream));
}
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
}
Expand All @@ -688,10 +701,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor

CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
if (hip_data->has_unified_addressing) {
CeedCallHip(ceed, hipDeviceSynchronize());
CeedCallHip(ceed, hipStreamSynchronize(stream));
norm_no_abs = fabs(d_array[index - 1]);
} else {
CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
CeedCallHip(ceed, hipStreamSynchronize(stream));
}
*norm = fabs(norm_no_abs);
#else /* HIP_VERSION */
Expand All @@ -706,10 +720,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor

CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
if (hip_data->has_unified_addressing) {
CeedCallHip(ceed, hipDeviceSynchronize());
CeedCallHip(ceed, hipStreamSynchronize(stream));
sub_max = fabs(d_array[index - 1]);
} else {
CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
CeedCallHip(ceed, hipStreamSynchronize(stream));
}
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
}
Expand Down Expand Up @@ -780,13 +795,16 @@ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) {
if (impl->d_array) {
#if (HIP_VERSION >= 60000000)
hipblasHandle_t handle;
hipStream_t stream;

CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
CeedCallHipblas(CeedVectorReturnCeed(x), hipblasGetStream(handle, &stream));
#if defined(CEED_SCALAR_IS_FP32)
CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
#else /* CEED_SCALAR */
CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
#endif /* CEED_SCALAR */
CeedCallHip(CeedVectorReturnCeed(x), hipStreamSynchronize(stream));
#else /* HIP_VERSION */
CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length));
#endif /* HIP_VERSION */
Expand Down Expand Up @@ -827,13 +845,16 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) {
CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
#if (HIP_VERSION >= 60000000)
hipblasHandle_t handle;
hipStream_t stream;

CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(y), &handle));
CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
CeedCallHipblas(CeedVectorReturnCeed(y), hipblasGetStream(handle, &stream));
#if defined(CEED_SCALAR_IS_FP32)
CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
#else /* CEED_SCALAR */
CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
#endif /* CEED_SCALAR */
CeedCallHip(CeedVectorReturnCeed(y), hipStreamSynchronize(stream));
#else /* HIP_VERSION */
CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length));
#endif /* HIP_VERSION */
Expand Down
5 changes: 4 additions & 1 deletion backends/hip-ref/ceed-hip-ref.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) {
Ceed_Hip *data;

CeedCallBackend(CeedGetData(ceed, &data));
if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
if (!data->hipblas_handle) {
CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
CeedCallHipblas(ceed, hipblasSetPointerMode(data->hipblas_handle, HIPBLAS_POINTER_MODE_HOST));
}
*handle = data->hipblas_handle;
return CEED_ERROR_SUCCESS;
}
Expand Down
3 changes: 3 additions & 0 deletions backends/hip-shared/ceed-hip-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
CeedCallBackend(CeedDestroy(&ceed));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -644,6 +645,7 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
CeedCallBackend(CeedFree(&data));
CeedCallBackend(CeedDestroy(&ceed));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -737,6 +739,7 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
hip_data->device_prop.sharedMemPerBlock) {
CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
CeedCallBackend(CeedDestroy(&ceed));
return CEED_ERROR_SUCCESS;
}
}
Expand Down
38 changes: 38 additions & 0 deletions interface/ceed.c
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,15 @@ int CeedReference(Ceed ceed) {
@ref Developer
**/
int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
if (!ceed->VectorCreate) {
Ceed delegate;

CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
CeedCall(CeedGetWorkVectorMemoryUsage(delegate, usage_mb));
CeedCall(CeedDestroy(&delegate));
return CEED_ERROR_SUCCESS;
}
*usage_mb = 0.0;
if (ceed->work_vectors) {
for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
Expand All @@ -852,6 +861,15 @@ int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
@ref Backend
**/
int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
if (!ceed->VectorCreate) {
Ceed delegate;

CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
Comment thread
jeremylt marked this conversation as resolved.
CeedCall(CeedClearWorkVectors(delegate, min_len));
CeedCall(CeedDestroy(&delegate));
return CEED_ERROR_SUCCESS;
}
if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
if (ceed->work_vectors->is_in_use[i]) continue;
Expand Down Expand Up @@ -890,6 +908,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
CeedInt i = 0;
CeedScalar usage_mb;

if (!ceed->VectorCreate) {
Ceed delegate;

CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
CeedCall(CeedGetWorkVector(delegate, len, vec));
CeedCall(CeedDestroy(&delegate));
return CEED_ERROR_SUCCESS;
}

if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed));

// Search for big enough work vector
Expand Down Expand Up @@ -936,6 +964,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
@ref Backend
**/
int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
if (!ceed->VectorCreate) {
Ceed delegate;

CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
CeedCall(CeedRestoreWorkVector(delegate, vec));
CeedCall(CeedDestroy(&delegate));
return CEED_ERROR_SUCCESS;
}

for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
if (*vec == ceed->work_vectors->vecs[i]) {
CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");
Expand Down
Loading