Commit b52b847

Qualcomm AI Engine Direct - Minimal Inference Runtime Core Requirement

1. Removed from_blob tensor creation
2. Compile and linking option optimization
3. Function visibility optimization
4. Expose power config to user

1 parent 60d57e5 commit b52b847

11 files changed: 128 additions & 26 deletions

CMakeLists.txt

Lines changed: 11 additions & 0 deletions

@@ -50,6 +50,17 @@ project(executorch)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
+# Hexagon toolchain with release build complains about code in third party
+# libraries.
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon AND ${CMAKE_BUILD_TYPE} STREQUAL
+   "Release"
+)
+  add_compile_options(
+    -Wno-error=format -Wno-error=implicit-int-conversion
+    -Wno-error=unused-variable -Wno-error=unused-function
+  )
+endif()
+
 # --- ExecuTorch Version ---
 # Parse version from version.txt (single source of truth)
 file(READ "${EXECUTORCH_ROOT}/version.txt" ET_VERSION_STRING)

backends/qualcomm/CMakeLists.txt

Lines changed: 20 additions & 9 deletions

@@ -90,22 +90,34 @@ if(${ANDROID})
   find_library(android_log log)
 endif()
 
-add_compile_options("-Wall" "-Werror" "-Wno-sign-compare")
+add_compile_options("-Wall" "-Werror" "-fvisibility=hidden")
 add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
-# GNU emit wanring for ignored attributes Unfortunately, we use [[maybe_unused]]
-# which can be ignored by GNU. So we make it a warning, not an error in GNU.
+# GNU emits warning for ignored attributes Unfortunately, we use
+# [[maybe_unused]] which can be ignored by GNU. So we make it a warning, not an
+# error in GNU.
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   add_compile_options("-Wno-error=attributes")
   add_link_options("-flto=auto")
 endif()
 
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   # strip symbols
-  add_link_options("-s")
+  add_link_options(LINKER:-s,--gc-sections)
+  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
+    add_compile_options(
+      "-Os"
+      "-ffunction-sections"
+      "-fdata-sections"
+      "-frtti"
+      "-fno-exceptions"
+      "-fomit-frame-pointer"
+      "-fno-asynchronous-unwind-tables"
+    )
+  else()
 
-  # --gc-sections is added by torch.
-  add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti")
+    add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti")
+  endif()
 endif()
 
 include_directories(
@@ -243,9 +255,8 @@ target_link_libraries(
   qnn_schema shared_buffer qnn_dlc_manager
 )
 target_link_libraries(
-  qnn_executorch_backend
-  PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core
-  extension_tensor qnn_backend_options
+  qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
+  executorch_core qnn_backend_options
 )
 
 if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 1 addition & 1 deletion

@@ -250,7 +250,7 @@ class PyQnnManager {
       std::vector<std::vector<std::shared_ptr<OpWrapper>>>& op_wrappers) {
     QnnExecuTorchContextBinary binary_info;
 
-    for (int i = 0; i < graph_names.size(); ++i) {
+    for (uint32_t i = 0; i < graph_names.size(); ++i) {
       if (qnn_manager_->Compile(graph_names[i], op_wrappers[i]) !=
           executorch::runtime::Error::Ok) {
         QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph");

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ class TensorWrapper {
           rank);
       return;
     }
-    for (int i = 0; i < rank; ++i) {
+    for (size_t i = 0; i < rank; ++i) {
      QNN_TENSOR_VER_PTR(tensor_)->dimensions[i] = dims[i];
    }
  }

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 7 additions & 3 deletions

@@ -63,14 +63,18 @@ struct CustomMemTensorInfo {
 /// alignment as MemoryAllocator::kDefaultAlignment.
 /// See runtime/core/memory_allocator.h. The function returns a valid pointer
 /// if allocation is successful.
-void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
+__attribute__((__visibility__("default"))) void* QnnExecuTorchAllocCustomMem(
+    size_t bytes,
+    size_t alignment);
 
 /// Add tensor to custom memory with custom type descriptor. Create memory
 /// handle to tensor wrapper during execution
-void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
+__attribute__((__visibility__("default"))) void
+QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
 
 /// Free the allocated shared memory.
-void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
+__attribute__((__visibility__("default"))) void QnnExecuTorchFreeCustomMem(
+    void* buffer_ptr);
 
 #ifdef __cplusplus
 }
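
Since the backend is now built with -fvisibility=hidden (see backends/qualcomm/CMakeLists.txt above), only symbols explicitly annotated with visibility("default"), such as this custom-memory C API, stay exported from the shared library. A minimal sketch of exercising the exported surface from Python via ctypes — the library name and loadability are assumptions for illustration:

import ctypes

# Hypothetical artifact name; adjust the path to your build output and make
# sure the QNN dependencies are resolvable before loading.
lib = ctypes.CDLL("libqnn_executorch_backend.so")

# Signatures mirror the declarations above in QnnExecuTorch.h.
lib.QnnExecuTorchAllocCustomMem.restype = ctypes.c_void_p
lib.QnnExecuTorchAllocCustomMem.argtypes = [ctypes.c_size_t, ctypes.c_size_t]
lib.QnnExecuTorchFreeCustomMem.argtypes = [ctypes.c_void_p]

buf = lib.QnnExecuTorchAllocCustomMem(4096, 64)  # bytes, alignment
assert buf is not None, "allocation failed"
lib.QnnExecuTorchFreeCustomMem(buf)

Without the visibility attribute, resolving these symbols under the new hidden-by-default build would fail (ctypes raises AttributeError on the lookup).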

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 13 additions & 8 deletions

@@ -12,7 +12,6 @@
 #include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
-#include <executorch/extension/tensor/tensor.h>
 #include <algorithm>
 #include <cstdlib>
 #include <cstring>
@@ -427,19 +426,25 @@ Error QnnManager::Execute(
           QNN_TENSOR_VER_PTR(output_tensor)->dimensions +
               QNN_TENSOR_VER_PTR(output_tensor)->rank);
 
-      auto dump_tensor = executorch::extension::from_blob(
-          QNN_TENSOR_VER_PTR(output_tensor)->clientBuf.data,
-          sizes,
+      std::vector<executorch::aten::StridesType> stride_size(sizes.size(), 0);
+      // Avoid using from_blob as it significantly increases shared library
+      // size.
+      executorch::aten::TensorImpl tensor_impl(
           qnn_dtype_to_scalar_type_[QNN_TENSOR_VER_PTR(output_tensor)
-                                        ->dataType]);
+                                        ->dataType],
+          sizes.size(),
+          sizes.data(),
+          QNN_TENSOR_VER_PTR(output_tensor)->clientBuf.data,
+          nullptr,
+          stride_size.data());
 
       executorch::runtime::event_tracer_log_output_delegate<
           executorch::aten::Tensor>(
           event_tracer,
           QNN_TENSOR_VER_PTR(output_tensor)->name,
           /*delegate_debug_id=*/
           static_cast<executorch::runtime::DebugHandle>(-1),
-          *dump_tensor);
+          executorch::aten::Tensor(&tensor_impl));
     }
   }
 
@@ -547,7 +552,7 @@ Error QnnManager::CompileDlc() {
 
   // Mapping memory address for the input and output of mutable buffer
   std::unordered_map<int, const void*> mutable_buffer_id_to_memory_map;
-  for (int i = 0; i < graphInfo.numInputTensors; ++i) {
+  for (uint32_t i = 0; i < graphInfo.numInputTensors; ++i) {
     auto tw = CreateTensorWrapper(graphInfo.inputTensors[i]);
     tw->UpdateQnnTensorMeta(graphInfo.inputTensors[i]);
 
@@ -560,7 +565,7 @@
     }
     graph_inputs.push_back(tw);
   }
-  for (int i = 0; i < graphInfo.numOutputTensors; ++i) {
+  for (uint32_t i = 0; i < graphInfo.numOutputTensors; ++i) {
     auto tw = CreateTensorWrapper(graphInfo.outputTensors[i]);
     tw->UpdateQnnTensorMeta(graphInfo.outputTensors[i]);
     int mutable_buffer_id = ExtractMutableBufferNumber(tw->GetName());

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
   hash_val ^= std::hash<void*>()(info.custom_mem);
   hash_val ^= std::hash<size_t>()(info.pos);
   hash_val ^= std::hash<size_t>()(info.tensor_bytes);
-  for (int i = 0; i < info.rank; ++i) {
+  for (size_t i = 0; i < info.rank; ++i) {
     hash_val ^= std::hash<uint32_t>()(info.shape[i]);
   }
   hash_val ^= std::hash<uint32_t>()(info.rank);
@@ -36,7 +36,7 @@ bool operator==(
       (lhs.tensor_addr == rhs.tensor_addr && lhs.custom_mem == rhs.custom_mem &&
        lhs.pos == rhs.pos && lhs.tensor_bytes == rhs.tensor_bytes &&
        lhs.rank == rhs.rank && lhs.dtype == rhs.dtype);
-  for (int i = 0; i < lhs.rank; ++i) {
+  for (size_t i = 0; i < lhs.rank; ++i) {
     is_same &= lhs.shape[i] == rhs.shape[i];
   }
   return is_same;

backends/qualcomm/runtime/backends/direct_mode/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -45,3 +45,4 @@ target_link_libraries(
   ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++.so.1
   ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++abi.so.1
 )
+target_compile_options(qnn_executorch_skel PRIVATE "-fvisibility=default")

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 56 additions & 0 deletions

@@ -29,6 +29,7 @@
 from executorch.backends.qualcomm.debugger.utils import generate_optrace
 from executorch.backends.qualcomm.serialization.qc_schema import (
     QnnExecuTorchBackendType,
+    QnnExecuTorchHtpPerformanceMode,
 )
 from executorch.backends.qualcomm.tests.utils import (
     convert_pt2e,
@@ -4790,6 +4791,33 @@ def setUp(self):
             saver=False,
         )
 
+    def test_qnn_backend_compile_time_option_htp_performance(self):
+        backend_options = generate_htp_compiler_spec(
+            use_fp16=True,
+            htp_performance_mode=QnnExecuTorchHtpPerformanceMode.kHtpHighPowerSaver,
+        )
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.chipset_table[TestQNN.model],
+            backend_options=backend_options,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+
+        def output_callback(log_msg):
+            msg = log_msg.stdout
+            # Refer to HtpDevice.cpp for the following values
+            min_voltage = "coreVoltageCornerMin 80"
+            self.assertTrue(min_voltage in msg, f"Expecting '{min_voltage} ' in log")
+
+        runtime_extra_commands = " --log_level 4"
+        self.lower_module_and_test_output(
+            module,
+            sample_input,
+            extra_cmds=runtime_extra_commands,
+            output_callback=partial(output_callback),
+            save_inference_speed=True,
+        )
+
     def test_qnn_backend_dump_intermediate_outputs_topk(self):
         TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=True)
@@ -5379,6 +5407,34 @@ def setUp(self):
             saver=False,
         )
 
+    def test_qnn_backend_compile_time_option_htp_performance(self):
+        backend_options = generate_htp_compiler_spec(
+            use_fp16=False,
+            htp_performance_mode=QnnExecuTorchHtpPerformanceMode.kHtpHighPowerSaver,
+        )
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.chipset_table[TestQNN.model],
+            backend_options=backend_options,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        module = self.get_qdq_module(module, sample_input)
+
+        def output_callback(log_msg):
+            msg = log_msg.stdout
+            # Refer to HtpDevice.cpp for the following values
+            min_voltage = "coreVoltageCornerMin 80"
+            self.assertTrue(min_voltage in msg, f"Expecting '{min_voltage} ' in log")
+
+        runtime_extra_commands = " --log_level 4"
+        self.lower_module_and_test_output(
+            module,
+            sample_input,
+            extra_cmds=runtime_extra_commands,
+            output_callback=partial(output_callback),
+            save_inference_speed=True,
+        )
+
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
         TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=False)

backends/qualcomm/utils/utils.py

Lines changed: 2 additions & 1 deletion

@@ -994,6 +994,7 @@ def generate_htp_compiler_spec(
     use_multi_contexts: bool = False,
     use_weight_sharing: bool = False,
     use_slc_allocator: bool = False,
+    htp_performance_mode: QnnExecuTorchHtpPerformanceMode = QnnExecuTorchHtpPerformanceMode.kHtpBurst,
 ) -> QnnExecuTorchBackendOptions:
     """
     Helper function generating backend options for QNN HTP
@@ -1025,7 +1026,7 @@
     # This actually is not an option which can affect the compiled blob.
     # But we don't have other place to pass this option at execution stage.
     # TODO: enable voting mechanism in runtime and make this as an option
-    htp_options.performance_mode = QnnExecuTorchHtpPerformanceMode.kHtpBurst
+    htp_options.performance_mode = htp_performance_mode
     htp_options.use_multi_contexts = use_multi_contexts
     htp_options.use_weight_sharing = use_weight_sharing
     htp_options.use_dlbc = use_dlbc
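
Taken together with the tests above, a minimal sketch of driving the newly exposed knob from user code. The import paths, helper names, and enum members come from this diff; the concrete chipset value is an assumption for illustration:

from executorch.backends.qualcomm.serialization.qc_schema import (
    QcomChipset,  # assumed enum of supported SoCs; pick your target
    QnnExecuTorchHtpPerformanceMode,
)
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

# Trade peak throughput for power by overriding the previously
# hard-coded kHtpBurst default.
backend_options = generate_htp_compiler_spec(
    use_fp16=True,
    htp_performance_mode=QnnExecuTorchHtpPerformanceMode.kHtpHighPowerSaver,
)
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,  # assumed target SoC
    backend_options=backend_options,
)

Note the in-code caveat still applies: the performance mode does not affect the compiled blob itself; it rides along in the compiler spec only because there is currently no other place to pass it for the execution stage.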
