Skip to content

Commit 1956afa

Browse files
committed
cuda: fix mixed basis assembly
1 parent d9b3770 commit 1956afa

2 files changed

Lines changed: 22 additions & 20 deletions

File tree

backends/cuda-ref/ceed-cuda-ref-operator.c

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,9 +1073,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
10731073
CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
10741074

10751075
// Create output restriction
1076-
CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
1077-
(CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
1078-
rstr));
1076+
CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, rstr));
10791077
// Create assembled vector
10801078
CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
10811079
}
@@ -1534,12 +1532,12 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
15341532
CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
15351533
CeedInt elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out;
15361534
CeedSize num_output_components;
1537-
CeedSize eval_mode_offset_in = 0, eval_mode_offset_out = 0;
15381535
const CeedScalar *h_B_in, *h_B_out;
15391536
CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
15401537
CeedBasis basis_in = NULL, basis_out = NULL;
15411538
CeedOperatorField *input_fields, *output_fields;
15421539
CeedOperator_Cuda *impl;
1540+
char *eval_mode_offsets_in_str, *eval_mode_offsets_out_str;
15431541

15441542
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
15451543
CeedCallBackend(CeedOperatorGetData(op, &impl));
@@ -1573,14 +1571,16 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
15731571
impl->num_blocks_out = num_active_bases_out;
15741572
}
15751573

1576-
rstr_in = active_rstrs_in[active_input];
1577-
basis_in = active_bases_in[active_input];
1578-
eval_mode_offset_in = eval_modes_offsets_in[active_input][0];
1579-
h_B_in = B_mats_in[active_input];
1580-
rstr_out = active_rstrs_out[active_output];
1581-
basis_out = active_bases_out[active_output];
1582-
eval_mode_offset_out = eval_modes_offsets_out[active_output][0];
1583-
h_B_out = B_mats_out[active_output];
1574+
rstr_in = active_rstrs_in[active_input];
1575+
basis_in = active_bases_in[active_input];
1576+
CeedCallBackend(CeedBuildArrayConstantSize_Cuda(ceed, "EVAL_MODE_OFFSETS_IN", num_eval_modes_in, eval_modes_offsets_in[active_input],
1577+
&eval_mode_offsets_in_str));
1578+
h_B_in = B_mats_in[active_input];
1579+
rstr_out = active_rstrs_out[active_output];
1580+
basis_out = active_bases_out[active_output];
1581+
CeedCallBackend(CeedBuildArrayConstantSize_Cuda(ceed, "EVAL_MODE_OFFSETS_OUT", num_eval_modes_out, eval_modes_offsets_out[active_output],
1582+
&eval_mode_offsets_out_str));
1583+
h_B_out = B_mats_out[active_output];
15841584
}
15851585

15861586
CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
@@ -1611,12 +1611,12 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
16111611
const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble-block.h>\n";
16121612
CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
16131613
CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
1614-
CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 13, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1615-
num_eval_modes_out, "EVAL_MODE_OFFSET_IN", eval_mode_offset_in, "EVAL_MODE_OFFSET_OUT", eval_mode_offset_out,
1616-
"NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "TOTAL_NUM_COMP_OUT", num_output_components,
1617-
"NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
1618-
asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y,
1619-
"USE_CEEDSIZE", use_ceedsize_idx));
1614+
CeedCallBackend(CeedCompileExtra_Cuda(ceed, assembly_kernel_source, &asmb->module, 11, 2, "NUM_EVAL_MODES_IN", num_eval_modes_in,
1615+
"NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out,
1616+
"TOTAL_NUM_COMP_OUT", num_output_components, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out,
1617+
"NUM_QPTS", num_qpts_in, "BLOCK_SIZE", asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block,
1618+
"BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE", use_ceedsize_idx, eval_mode_offsets_in_str,
1619+
eval_mode_offsets_out_str));
16201620
CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssembleBlock", &asmb->LinearAssemble));
16211621

16221622
// Load into B_in, in order that they will be used in eval_modes_in
@@ -1634,6 +1634,8 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
16341634
CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, out_bytes));
16351635
CeedCallCuda(ceed, cudaMemcpy(asmb->d_B_out, h_B_out, out_bytes, cudaMemcpyHostToDevice));
16361636
}
1637+
CeedCallBackend(CeedFree(&eval_mode_offsets_in_str));
1638+
CeedCallBackend(CeedFree(&eval_mode_offsets_out_str));
16371639
CeedCallBackend(CeedDestroy(&ceed));
16381640
return CEED_ERROR_SUCCESS;
16391641
}

include/ceed/jit-source/cuda/cuda-ref-operator-assemble-block.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
5050

5151
for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) {
5252
for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) {
53-
const IndexType row_offset = EVAL_MODE_OFFSET_IN + e_in * NUM_COMP_IN + comp_in;
54-
const IndexType col_offset = EVAL_MODE_OFFSET_OUT + e_out * NUM_COMP_OUT + comp_out;
53+
const IndexType row_offset = EVAL_MODE_OFFSETS_IN[e_in] + comp_in;
54+
const IndexType col_offset = EVAL_MODE_OFFSETS_OUT[e_out] + comp_out;
5555
const IndexType comp_index = row_offset * TOTAL_NUM_COMP_OUT + col_offset;
5656

5757
// Perform the B^T D B operation for this 'chunk' of D (the qf_array)

0 commit comments

Comments
 (0)