Skip to content

Commit 57ac1d0

Browse files
committed
[HAL][AMDGPU] Support raw HSACO custom kernargs
Allow AMDGPU raw HSACO modules to expose HIP-visible kernel symbols that have companion kernel descriptors, even when the code object metadata only describes a subset of exports.

Preserve native kernarg layouts for those raw kernels by widening reflected parameter sizes and offsets, accepting pre-packed HIP argument blobs that omit trailing ABI padding, zero-filling missing bytes, and synthesizing HIP implicit args at the metadata-derived or HSA-reported suffix offset.

Use ELF symbol metadata and kernel descriptors to identify additional raw code object exports and keep their dispatch ABI consistent with the loaded HSA executable.

Signed-off-by: Andrew Woloszyn <andrew.woloszyn@gmail.com>
1 parent f0103e8 commit 57ac1d0

9 files changed

Lines changed: 639 additions & 57 deletions

File tree

runtime/src/iree/hal/drivers/amdgpu/abi/kernel_args.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,8 +54,11 @@ typedef struct iree_hal_amdgpu_device_kernel_args_t {
5454
uint16_t constant_count;
5555
// Total number of bindings used by the dispatch (if a HAL dispatch).
5656
uint16_t binding_count;
57+
// Offset of HIP/OpenCL implicit args within custom-direct kernargs, or
58+
// UINT16_MAX if the kernel does not use an implicit suffix.
59+
uint16_t implicit_args_offset;
5760
// Reserved for future hot kernel metadata. Must be zero.
58-
uint32_t reserved;
61+
uint16_t reserved;
5962
} iree_hal_amdgpu_device_kernel_args_t;
6063
IREE_AMDGPU_STATIC_ASSERT(
6164
sizeof(iree_hal_amdgpu_device_kernel_args_t) <= 64,

runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c

Lines changed: 59 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1690,11 +1690,32 @@ static iree_status_t iree_hal_amdgpu_aql_command_buffer_write_dispatch_tail(
16901690
}
16911691
return iree_ok_status();
16921692
}
1693-
case IREE_HAL_AMDGPU_COMMAND_BUFFER_KERNARG_STRATEGY_CUSTOM_DIRECT:
1694-
if (constants.data_length > 0) {
1695-
memcpy(tail_payload, constants.data, constants.data_length);
1693+
case IREE_HAL_AMDGPU_COMMAND_BUFFER_KERNARG_STRATEGY_CUSTOM_DIRECT: {
1694+
// Custom-direct callers hand us a pre-packed HIP ABI blob. Some raw
1695+
// HSACO kernels also need an implicit suffix synthesized after that blob,
1696+
// so zero the full reservation before copying only caller-owned bytes.
1697+
const iree_host_size_t total_kernarg_size =
1698+
layout->total_kernarg_size ? layout->total_kernarg_size
1699+
: constants.data_length;
1700+
memset(tail_payload, 0, total_kernarg_size);
1701+
const iree_host_size_t explicit_bytes =
1702+
layout->has_implicit_args ? layout->implicit_args_offset
1703+
: total_kernarg_size;
1704+
const iree_host_size_t copy_bytes =
1705+
constants.data_length < explicit_bytes ? constants.data_length
1706+
: explicit_bytes;
1707+
if (copy_bytes > 0) {
1708+
memcpy(tail_payload, constants.data, copy_bytes);
1709+
}
1710+
if (layout->has_implicit_args) {
1711+
iree_amdgpu_kernel_implicit_args_t* implicit_args =
1712+
(iree_amdgpu_kernel_implicit_args_t*)(tail_payload +
1713+
layout->implicit_args_offset);
1714+
iree_hal_amdgpu_aql_command_buffer_write_implicit_args(
1715+
kernel_args, config, implicit_args);
16961716
}
16971717
return iree_ok_status();
1718+
}
16981719
case IREE_HAL_AMDGPU_COMMAND_BUFFER_KERNARG_STRATEGY_INDIRECT:
16991720
return iree_make_status(
17001721
IREE_STATUS_UNIMPLEMENTED,
@@ -2017,18 +2038,40 @@ static iree_status_t iree_hal_amdgpu_aql_command_buffer_prepare_dispatch_plan(
20172038

20182039
if (iree_hal_amdgpu_aql_dispatch_plan_uses_custom_direct_arguments(
20192040
out_plan)) {
2020-
if (IREE_UNLIKELY(inputs->constants.data_length !=
2021-
out_plan->descriptor->kernel_args.kernarg_size)) {
2041+
// Callers (e.g. rocBLAS/Tensile) sometimes omit trailing ABI padding or pad
2042+
// beyond the declared kernarg_segment_size with extra trailing scalars. The
2043+
// kernel only reads its declared size, so trailing bytes are ignored and the
2044+
// memcpy in write_dispatch_tail clamps to the declared size.
2045+
//
2046+
// Validate after 8-byte ABI padding so we accept missing tail padding while
2047+
// still rejecting truly short pre-packed HIP argument buffers.
2048+
const uint32_t required_explicit_bytes =
2049+
(uint32_t)out_plan->descriptor->custom_kernarg_layout
2050+
.explicit_kernarg_size;
2051+
const iree_host_size_t padded_constant_length =
2052+
iree_host_align(inputs->constants.data_length, /*alignment=*/8);
2053+
if (IREE_UNLIKELY(padded_constant_length < required_explicit_bytes)) {
20222054
return iree_make_status(
20232055
IREE_STATUS_INVALID_ARGUMENT,
2024-
"custom dispatch argument length mismatch; expected %u but got "
2025-
"%" PRIhsz,
2026-
out_plan->descriptor->kernel_args.kernarg_size,
2027-
inputs->constants.data_length);
2056+
"custom dispatch argument length too short; expected at least %u "
2057+
"but got %" PRIhsz " (padded to %" PRIhsz ")",
2058+
required_explicit_bytes, inputs->constants.data_length,
2059+
padded_constant_length);
20282060
}
20292061
out_plan->layout = &out_plan->descriptor->custom_kernarg_layout;
20302062
out_plan->kernarg_block_count =
20312063
iree_max(1u, out_plan->descriptor->custom_kernarg_block_count);
2064+
if (out_plan->layout->total_kernarg_size == 0 &&
2065+
inputs->constants.data_length > 0) {
2066+
// Some raw kernels have no reflected kernarg size. In that case the
2067+
// caller-provided blob is the only reservation size we can trust.
2068+
const uint32_t provided_kernarg_block_count =
2069+
(uint32_t)iree_host_size_ceil_div(
2070+
inputs->constants.data_length,
2071+
sizeof(iree_hal_amdgpu_kernarg_block_t));
2072+
out_plan->kernarg_block_count =
2073+
iree_max(out_plan->kernarg_block_count, provided_kernarg_block_count);
2074+
}
20322075
out_plan->kernarg_strategy =
20332076
IREE_HAL_AMDGPU_COMMAND_BUFFER_KERNARG_STRATEGY_CUSTOM_DIRECT;
20342077
return iree_ok_status();
@@ -2102,14 +2145,18 @@ iree_hal_amdgpu_aql_command_buffer_calculate_dispatch_layout(
21022145
? 0
21032146
: (iree_host_size_t)plan->kernel_args->binding_count *
21042147
sizeof(uint64_t);
2105-
const iree_host_size_t tail_byte_length =
2106-
plan->layout->total_kernarg_size - binding_bytes;
2148+
const iree_host_size_t total_kernarg_size =
2149+
iree_hal_amdgpu_aql_dispatch_plan_uses_custom_direct_arguments(plan) &&
2150+
plan->layout->total_kernarg_size == 0
2151+
? inputs->constants.data_length
2152+
: plan->layout->total_kernarg_size;
2153+
const iree_host_size_t tail_byte_length = total_kernarg_size - binding_bytes;
21072154
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_aql_command_buffer_qword_length(
21082155
tail_byte_length, "dispatch tail payload",
21092156
&out_layout->kernarg.tail_length_qwords,
21102157
&out_layout->kernarg.tail_padded_length));
21112158
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_aql_command_buffer_qword_length(
2112-
plan->layout->total_kernarg_size, "dispatch kernarg",
2159+
total_kernarg_size, "dispatch kernarg",
21132160
&out_layout->kernarg.total_length_qwords,
21142161
&out_layout->kernarg.total_padded_length));
21152162
out_layout->kernarg.implicit_args_offset_qwords =

runtime/src/iree/hal/drivers/amdgpu/device/dispatch.c

Lines changed: 23 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -96,14 +96,34 @@ void iree_hal_amdgpu_device_dispatch_emplace_hal_kernargs(
9696
}
9797

9898
void iree_hal_amdgpu_device_dispatch_emplace_custom_kernargs(
99+
const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT
100+
kernel_args,
101+
const uint32_t workgroup_count[3], uint32_t dynamic_workgroup_local_memory,
99102
const iree_hal_amdgpu_device_dispatch_kernarg_layout_t* IREE_AMDGPU_RESTRICT
100103
layout,
101104
const void* IREE_AMDGPU_RESTRICT custom_kernarg_ptr,
105+
size_t custom_kernarg_length,
102106
void* IREE_AMDGPU_RESTRICT kernarg_ptr) {
103-
if (layout->total_kernarg_size > 0) {
104-
iree_amdgpu_memcpy(kernarg_ptr, custom_kernarg_ptr,
105-
layout->total_kernarg_size);
107+
const size_t total_kernarg_size =
108+
layout->total_kernarg_size ? layout->total_kernarg_size
109+
: custom_kernarg_length;
110+
if (total_kernarg_size > 0) {
111+
iree_amdgpu_memset(kernarg_ptr, 0, total_kernarg_size);
112+
const size_t explicit_bytes =
113+
layout->has_implicit_args
114+
? layout->implicit_args_offset
115+
: total_kernarg_size;
116+
const size_t copy_bytes =
117+
custom_kernarg_length < explicit_bytes ? custom_kernarg_length
118+
: explicit_bytes;
119+
if (copy_bytes > 0) {
120+
iree_amdgpu_memcpy(kernarg_ptr, custom_kernarg_ptr, copy_bytes);
121+
}
106122
}
123+
124+
iree_hal_amdgpu_device_dispatch_emplace_implicit_args(
125+
kernel_args, workgroup_count, dynamic_workgroup_local_memory, layout,
126+
kernarg_ptr);
107127
}
108128

109129
//===----------------------------------------------------------------------===//

runtime/src/iree/hal/drivers/amdgpu/device/dispatch.h

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -178,18 +178,24 @@ void iree_hal_amdgpu_device_dispatch_emplace_hal_kernargs(
178178

179179
// Populates custom direct kernargs in already-reserved storage.
180180
//
181-
// |custom_kernarg_ptr| must provide |layout->total_kernarg_size| bytes in the
182-
// final kernel ABI shape expected by the target kernel.
181+
// |custom_kernarg_ptr| provides up to |layout->total_kernarg_size| bytes in the
182+
// final kernel ABI shape expected by the target kernel. Missing trailing padding
183+
// bytes remain zeroed.
183184
//
184185
// Preconditions:
185-
// - |layout| and |kernarg_ptr| are non-NULL.
186+
// - |kernel_args|, |workgroup_count|, |layout|, and |kernarg_ptr| are
187+
// non-NULL.
186188
// - |layout| was derived with
187189
// iree_hal_amdgpu_device_dispatch_make_custom_kernarg_layout.
188-
// - |custom_kernarg_ptr| is non-NULL when |layout->total_kernarg_size| > 0.
190+
// - |custom_kernarg_ptr| is non-NULL when |custom_kernarg_length| > 0.
189191
void iree_hal_amdgpu_device_dispatch_emplace_custom_kernargs(
192+
const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT
193+
kernel_args,
194+
const uint32_t workgroup_count[3], uint32_t dynamic_workgroup_local_memory,
190195
const iree_hal_amdgpu_device_dispatch_kernarg_layout_t* IREE_AMDGPU_RESTRICT
191196
layout,
192197
const void* IREE_AMDGPU_RESTRICT custom_kernarg_ptr,
198+
size_t custom_kernarg_length,
193199
void* IREE_AMDGPU_RESTRICT kernarg_ptr);
194200

195201
// Populates the builtin patch dispatch that updates an indirect-parameter

runtime/src/iree/hal/drivers/amdgpu/executable.c

Lines changed: 70 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -965,6 +965,7 @@ static iree_status_t iree_hal_amdgpu_executable_resolve_kernel_args_from_symbol(
965965

966966
out_kernel_args->binding_count = binding_count;
967967
out_kernel_args->constant_count = constant_count;
968+
out_kernel_args->implicit_args_offset = UINT16_MAX;
968969

969970
IREE_TRACE_ZONE_END(z0);
970971
return iree_ok_status();
@@ -1141,6 +1142,21 @@ static iree_status_t iree_hal_amdgpu_executable_initialize_dispatch_descriptor(
11411142
out_descriptor->custom_kernarg_layout =
11421143
iree_hal_amdgpu_device_dispatch_make_custom_kernarg_layout(
11431144
kernel_args->kernarg_size);
1145+
const uint16_t custom_implicit_args_offset =
1146+
kernel_args->implicit_args_offset != UINT16_MAX
1147+
? kernel_args->implicit_args_offset
1148+
: kernel_args->kernarg_size;
1149+
if (custom_implicit_args_offset != UINT16_MAX) {
1150+
out_descriptor->custom_kernarg_layout.explicit_kernarg_size =
1151+
custom_implicit_args_offset;
1152+
out_descriptor->custom_kernarg_layout.implicit_args_offset =
1153+
custom_implicit_args_offset;
1154+
out_descriptor->custom_kernarg_layout.total_kernarg_size = iree_max(
1155+
(size_t)kernel_args->kernarg_size,
1156+
(size_t)custom_implicit_args_offset +
1157+
IREE_AMDGPU_KERNEL_IMPLICIT_ARGS_SIZE);
1158+
out_descriptor->custom_kernarg_layout.has_implicit_args = true;
1159+
}
11441160
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_calculate_kernarg_block_count(
11451161
&out_descriptor->custom_kernarg_layout,
11461162
&out_descriptor->custom_kernarg_block_count));
@@ -1656,13 +1672,6 @@ iree_hal_amdgpu_executable_calculate_raw_hsaco_reflection_storage(
16561672
iree_host_size_t* out_export_name_storage_size,
16571673
iree_host_size_t* out_export_parameter_count,
16581674
iree_host_size_t* out_export_parameter_name_storage_size) {
1659-
if (iree_string_view_is_empty(hsaco_metadata->target)) {
1660-
return iree_make_status(
1661-
IREE_STATUS_INVALID_ARGUMENT,
1662-
"raw HSACO metadata is missing `amdhsa.target`; direct loading "
1663-
"requires the code object to declare its target ISA");
1664-
}
1665-
16661675
iree_host_size_t export_name_storage_size = 0;
16671676
iree_host_size_t export_parameter_count = 0;
16681677
iree_host_size_t export_parameter_name_storage_size = 0;
@@ -1922,6 +1931,60 @@ static iree_status_t iree_hal_amdgpu_executable_resolve_raw_hsaco_kernel_args(
19221931
requirements.binding_count, &host_kernel_args[kernel_ordinal]),
19231932
"resolving kernel args for raw kernel `%.*s`", (int)symbol_name.size,
19241933
symbol_name.data);
1934+
// Raw HSACO metadata is the source of truth for caller-visible kernargs:
1935+
// some code objects report a smaller HSA symbol size than the metadata
1936+
// segment needed by pre-packed HIP launch buffers.
1937+
if (host_kernel_args[kernel_ordinal].kernarg_size <
1938+
kernel->kernarg_segment_size) {
1939+
host_kernel_args[kernel_ordinal].kernarg_size =
1940+
kernel->kernarg_segment_size;
1941+
}
1942+
if (host_kernel_args[kernel_ordinal].kernarg_alignment <
1943+
kernel->kernarg_segment_alignment) {
1944+
host_kernel_args[kernel_ordinal].kernarg_alignment =
1945+
kernel->kernarg_segment_alignment;
1946+
}
1947+
1948+
uint16_t implicit_args_offset = UINT16_MAX;
1949+
uint32_t explicit_args_end = 0;
1950+
for (iree_host_size_t arg_i = 0; arg_i < kernel->arg_count; ++arg_i) {
1951+
const iree_hal_amdgpu_hsaco_metadata_arg_t* arg =
1952+
&kernel->args[arg_i];
1953+
if (arg->kind == IREE_HAL_AMDGPU_HSACO_METADATA_ARG_KIND_HIDDEN ||
1954+
arg->kind == IREE_HAL_AMDGPU_HSACO_METADATA_ARG_KIND_HIDDEN_NONE) {
1955+
if (arg->offset <= UINT16_MAX &&
1956+
(implicit_args_offset == UINT16_MAX ||
1957+
arg->offset < implicit_args_offset)) {
1958+
implicit_args_offset = (uint16_t)arg->offset;
1959+
}
1960+
} else {
1961+
iree_host_size_t arg_end = 0;
1962+
if (!iree_host_size_checked_add(arg->offset, arg->size, &arg_end) ||
1963+
arg_end > UINT32_MAX) {
1964+
return iree_make_status(
1965+
IREE_STATUS_OUT_OF_RANGE,
1966+
"AMDGPU kernel `%.*s` argument offset overflows",
1967+
(int)symbol_name.size, symbol_name.data);
1968+
}
1969+
explicit_args_end = iree_max(explicit_args_end, (uint32_t)arg_end);
1970+
}
1971+
}
1972+
if (implicit_args_offset != UINT16_MAX && implicit_args_offset > 0) {
1973+
host_kernel_args[kernel_ordinal].implicit_args_offset =
1974+
implicit_args_offset;
1975+
} else if (explicit_args_end <= UINT16_MAX &&
1976+
host_kernel_args[kernel_ordinal].kernarg_size >=
1977+
explicit_args_end + IREE_AMDGPU_KERNEL_IMPLICIT_ARGS_SIZE) {
1978+
host_kernel_args[kernel_ordinal].implicit_args_offset =
1979+
(uint16_t)explicit_args_end;
1980+
} else if (kernel->uses_borrowed_arg_layout &&
1981+
host_kernel_args[kernel_ordinal].kernarg_size > 0) {
1982+
// ELF-synthesized exports borrow a template arg layout, so its hidden arg
1983+
// offsets may not apply. Preserve the full caller-provided raw kernarg
1984+
// blob and append the implicit suffix after the HSA-reported segment.
1985+
host_kernel_args[kernel_ordinal].implicit_args_offset =
1986+
host_kernel_args[kernel_ordinal].kernarg_size;
1987+
}
19251988
}
19261989
return iree_ok_status();
19271990
}

runtime/src/iree/hal/drivers/amdgpu/host_queue_dispatch.c

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -221,13 +221,17 @@ static iree_status_t iree_hal_amdgpu_host_queue_validate_dispatch_kernargs(
221221

222222
iree_host_size_t operation_resource_count = 1;
223223
if (iree_any_bit_set(flags, IREE_HAL_DISPATCH_FLAG_CUSTOM_DIRECT_ARGUMENTS)) {
224-
if (IREE_UNLIKELY(constants.data_length !=
225-
descriptor->kernel_args.kernarg_size)) {
224+
const uint32_t required_explicit_bytes =
225+
(uint32_t)descriptor->custom_kernarg_layout.explicit_kernarg_size;
226+
const iree_host_size_t padded_constant_length =
227+
iree_host_align(constants.data_length, /*alignment=*/8);
228+
if (IREE_UNLIKELY(padded_constant_length < required_explicit_bytes)) {
226229
return iree_make_status(
227230
IREE_STATUS_INVALID_ARGUMENT,
228-
"custom dispatch argument length mismatch; expected %u but got "
229-
"%" PRIhsz,
230-
descriptor->kernel_args.kernarg_size, constants.data_length);
231+
"custom dispatch argument length too short; expected at least %u "
232+
"but got %" PRIhsz " (padded to %" PRIhsz ")",
233+
required_explicit_bytes, constants.data_length,
234+
padded_constant_length);
231235
}
232236
if (IREE_UNLIKELY(constants.data_length > 0 && !constants.data)) {
233237
return iree_make_status(
@@ -238,6 +242,13 @@ static iree_status_t iree_hal_amdgpu_host_queue_validate_dispatch_kernargs(
238242
*out_layout = &descriptor->custom_kernarg_layout;
239243
*out_kernarg_block_count =
240244
iree_max(1u, descriptor->custom_kernarg_block_count);
245+
if ((*out_layout)->total_kernarg_size == 0 && constants.data_length > 0) {
246+
const uint32_t provided_kernarg_block_count =
247+
(uint32_t)iree_host_size_ceil_div(
248+
constants.data_length, sizeof(iree_hal_amdgpu_kernarg_block_t));
249+
*out_kernarg_block_count =
250+
iree_max(*out_kernarg_block_count, provided_kernarg_block_count);
251+
}
241252
} else {
242253
if (IREE_UNLIKELY(constants.data_length > 0 && !constants.data)) {
243254
return iree_make_status(
@@ -531,7 +542,9 @@ static iree_status_t iree_hal_amdgpu_host_queue_submit_direct_dispatch(
531542

532543
if (uses_custom_direct_arguments) {
533544
iree_hal_amdgpu_device_dispatch_emplace_custom_kernargs(
534-
plan->layout, constants.data, submission.kernel.kernargs.blocks->data);
545+
plan->kernel_args, config.workgroup_count,
546+
config.dynamic_workgroup_local_memory, plan->layout, constants.data,
547+
constants.data_length, submission.kernel.kernargs.blocks->data);
535548
} else {
536549
iree_hal_amdgpu_device_dispatch_emplace_hal_kernargs(
537550
plan->kernel_args, config.workgroup_count,
@@ -725,7 +738,9 @@ static iree_status_t iree_hal_amdgpu_host_queue_submit_indirect_dispatch(
725738
const uint32_t placeholder_workgroup_count[3] = {0, 0, 0};
726739
if (uses_custom_direct_arguments) {
727740
iree_hal_amdgpu_device_dispatch_emplace_custom_kernargs(
728-
plan->layout, constants.data, dispatch_kernarg_data);
741+
plan->kernel_args, placeholder_workgroup_count,
742+
config.dynamic_workgroup_local_memory, plan->layout, constants.data,
743+
constants.data_length, dispatch_kernarg_data);
729744
} else {
730745
iree_hal_amdgpu_device_dispatch_emplace_hal_kernargs(
731746
plan->kernel_args, placeholder_workgroup_count,

0 commit comments

Comments
 (0)