Skip to content

Commit 321138d

Browse files
authored
Merge pull request #1969 from CEED/zach/rocm-7-fixes
HIP: Add support for HIP 7+
2 parents e647d04 + 4dd9e2c commit 321138d

10 files changed

Lines changed: 60 additions & 39 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ On this page we provide a summary of the main API changes, new features and exam
1717
- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
1818
- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity.
1919
- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity.
20+
- Add `build_objects` parameter to `CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback` to allow for passing uninitialized vectors and restrictions
2021

2122
### New features
2223

Makefile

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,19 +570,26 @@ HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME
570570
HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
571571
HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
572572
ifneq ($(HIP_LIB_DIR),)
573+
HIP_CONFIG := $(ROCM_DIR)/bin/hipconfig
573574
ifeq ($(HIP_LIB_NAME),CHIP)
574575
# chipStar hipconfig -C emits clang-only flags; keep only -D/-I/-include for gcc
575-
HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
576+
HIPCONFIG_CPPFLAGS := $(shell $(HIP_CONFIG) -C)
576577
HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include
577578
else
578-
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C))
579+
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(HIP_CONFIG) -C))
579580
HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS)
580581
endif
581582
$(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
582583
ifneq ($(CXX), $(HIPCC))
583584
$(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
584585
endif
585586
PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
587+
HIP_MAJOR_VERSION := $(shell $(HIP_CONFIG) --version | cut -d'.' -f1)
588+
ifeq ($(HIP_MAJOR_VERSION),7)
589+
PKG_LIBS += -lhiprtc
590+
$(info $(PKG_LIBS))
591+
endif
592+
$(info $(HIP_MAJOR_VERSION))
586593
LIBCEED_CONTAINS_CXX = 1
587594
libceed.c += $(hip-all.c)
588595
libceed.cpp += $(hip-all.cpp)

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo
517517

518518
CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
519519
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
520-
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
520+
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
521521
return CEED_ERROR_SUCCESS;
522522
}
523523
return CEED_ERROR_SUCCESS;

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
302302

303303
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
304304
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
305-
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
305+
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
306306
} else {
307307
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
308308
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
@@ -341,7 +341,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
341341

342342
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
343343
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
344-
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
344+
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
345345
} else {
346346
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
347347
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
@@ -357,7 +357,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
357357

358358
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
359359
} else if (is_active && skip_active_load) {
360-
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
360+
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
361361
} else {
362362
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
363363
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -373,7 +373,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
373373

374374
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
375375
} else if (is_active && skip_active_load) {
376-
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
376+
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
377377
} else {
378378
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
379379
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -384,7 +384,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
384384

385385
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
386386
} else if (is_active && skip_active_load) {
387-
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
387+
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
388388
} else {
389389
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
390390
<< (is_tensor ? "" : var_suffix) << "];\n";

backends/hip-gen/ceed-hip-gen-operator.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool
496496

497497
CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
498498
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
499-
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
499+
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
500500
return CEED_ERROR_SUCCESS;
501501
}
502502
return CEED_ERROR_SUCCESS;

backends/hip/ceed-hip-compile.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const char *name,
120120
CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version));
121121
if (runtime_version < 40400000) {
122122
code << "#include <hip/hip_runtime.h>\n\n";
123-
}
124-
// With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
125-
else {
123+
} else if (runtime_version < 70000000) {
126124
code << "#include <stddef.h>\n";
125+
// With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
126+
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
127+
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
128+
} else {
129+
// ROCm 7 removed stddef header, so we use the internal HIP types
130+
code << "using __hip_internal::int32_t;\n";
131+
code << "using __hip_internal::int64_t;\n";
132+
// With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
127133
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
128134
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
129135
}

include/ceed/backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -473,8 +473,8 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
473473
CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
474474
CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
475475
CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
476-
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
477-
CeedRequest *request);
476+
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, bool build_objects, CeedVector *assembled,
477+
CeedElemRestriction *rstr, CeedRequest *request);
478478
CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedSize offset, CeedVector values);
479479
CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);
480480

include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,15 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
6363
for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) {
6464
IndexType d_in = 0;
6565
CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE;
66-
const CeedScalar *b_t = NULL;
66+
const CeedScalar *b_t = nullptr;
6767

6868
GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out);
6969
if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0;
7070
else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES];
7171
eval_modes_out_prev = eval_modes_out[e_out];
7272

7373
for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) {
74-
const CeedScalar *b = NULL;
74+
const CeedScalar *b = nullptr;
7575

7676
GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in);
7777
if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0;

include/ceed/jit-source/hip/hip-shared-basis-tensor.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -407,21 +407,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
407407

408408
if (BASIS_DIM == 1) {
409409
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
410-
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
410+
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
411411
if (e < num_elem) {
412412
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
413413
}
414414
} else if (BASIS_DIM == 2) {
415415
ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
416-
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
416+
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
417417
if (e < num_elem) {
418418
WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
419419
d_V);
420420
}
421421
} else if (BASIS_DIM == 3) {
422422
ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
423423
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
424-
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
424+
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
425425
if (e < num_elem) {
426426
WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
427427
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
@@ -522,21 +522,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
522522

523523
if (BASIS_DIM == 1) {
524524
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
525-
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
525+
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
526526
if (e < num_elem) {
527527
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
528528
}
529529
} else if (BASIS_DIM == 2) {
530530
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
531531
r_U);
532-
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
532+
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
533533
if (e < num_elem) {
534534
WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
535535
}
536536
} else if (BASIS_DIM == 3) {
537537
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
538538
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
539-
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
539+
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
540540
if (e < num_elem) {
541541
WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
542542
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
@@ -637,21 +637,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
637637

638638
if (BASIS_DIM == 1) {
639639
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
640-
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
640+
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
641641
if (e < num_elem) {
642642
SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
643643
}
644644
} else if (BASIS_DIM == 2) {
645645
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
646646
r_U);
647-
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
647+
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
648648
if (e < num_elem) {
649649
SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
650650
}
651651
} else if (BASIS_DIM == 3) {
652652
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
653653
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
654-
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
654+
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
655655
if (e < num_elem) {
656656
SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
657657
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);

0 commit comments

Comments
 (0)