diff --git a/.claude/skills/qualcomm/SKILL.md b/.claude/skills/qualcomm/SKILL.md index bcbd581e293..86ea3351fc9 100644 --- a/.claude/skills/qualcomm/SKILL.md +++ b/.claude/skills/qualcomm/SKILL.md @@ -31,7 +31,7 @@ Use `backends/qualcomm/scripts/build.sh`. Linux only (macOS not supported). |---|---|---| | x86_64 (Python interface + host tools) | enabled | `build-x86/` | | Android arm64-v8a (device runner) | enabled | `build-android/` | -| Hexagon DSP (direct mode) | disabled | `build-hexagon/` | +| Direct mode (LPAI ADSP or Hexagon CDSP) | disabled | `build-direct/` | | OE Linux embedded | disabled | `build-oe-linux/` | **Common build commands:** diff --git a/.gitignore b/.gitignore index aeb4aa14e93..a0eeaa92117 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,7 @@ cmake-out* cmake-out-android/ build-android/ build-x86/ -build-hexagon/ +build-direct/ dist/ arm-scratch/ executorch.egg-info diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 6303114d2fd..08658809438 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -248,21 +248,31 @@ target_link_libraries( if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon) # Add macro here so we can dlopen the correct .so library. - if(DSP_TYPE STREQUAL "3") - string(TOUPPER ${DSP_VERSION} CAPITAL_DSP_VERSION) - set(HEXAGON_LIB "libQnnHtp${CAPITAL_DSP_VERSION}.so") + if(DSP_TYPE STREQUAL "0") + message( + STATUS + "aDSP direct and non-direct mode uses shared libraries under different folders but have the same name, skipping HEXAGON_LIB override." + ) + # Just random string here since QnnBackendUnifiedRegistry.h will ignore this + # macro for aDSP case. 
+ add_compile_definitions(HEXAGON_LIB="") + elseif(DSP_TYPE STREQUAL "3") + string(TOUPPER ${CDSP_VERSION} CAPITAL_CDSP_VERSION) + set(HEXAGON_LIB "libQnnHtp${CAPITAL_CDSP_VERSION}.so") add_compile_definitions(HEXAGON_LIB="${HEXAGON_LIB}") message(STATUS "For hexagon build, using HTP Library: ${HEXAGON_LIB}") else() message(FATAL_ERROR "Unknown DSP_TYPE ${DSP_TYPE}") endif() + # aDSP will also use cDSP c/c++ library since aDSP does not have these + # libraries. target_link_libraries( qnn_executorch_backend PRIVATE - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc.so - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++.so.1 - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++abi.so.1 + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc.so + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc++.so.1 + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc++abi.so.1 ) endif() diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 17d76aac412..1e8b6423719 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -115,19 +115,24 @@ TensorWrapper::TensorWrapper( } } -Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { +Error TensorWrapper::FillDataBuffer(const void* data) { if (data != nullptr) { QNN_TENSOR_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; +#ifdef __hexagon__ + // data's address is already aligned in idl skel implementation. e.g. + // QnnExecuTorchIdlWrapper.cpp Here, we are ensuring we pass data size that + // is also multiple of 64. QnnExecuTorchIdlWrapper.cpp should have created + // sufficient space for tensor. 
+ auto align_size = [](size_t alignment, size_t sz) { + return (sz + (alignment - 1)) & ~(alignment - 1); + }; + QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize = + align_size(QNN_TENSOR_ALIGNMENT, bytes_); +#else QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; - if (copy_data) { - owned_data_ = std::make_unique(bytes_); - const char* src_data = static_cast(data); - std::memcpy(owned_data_.get(), src_data, bytes_); - QNN_TENSOR_VER_PTR(tensor_)->clientBuf.data = owned_data_.get(); - } else { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - QNN_TENSOR_VER_PTR(tensor_)->clientBuf.data = const_cast(data); - } +#endif + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + QNN_TENSOR_VER_PTR(tensor_)->clientBuf.data = const_cast(data); } else { QNN_EXECUTORCH_LOG_WARN("Data pointer is nullptr"); } diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 6f20a807820..d8661acc492 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -19,6 +19,9 @@ #define QNN_TENSOR_VER_PTR(x) (&((x).v2)) #define QNN_OP_VER_PTR(x) (&((x).v1)) +// This is for direct mode, especially LPAI +#define QNN_TENSOR_ALIGNMENT 64 + namespace executorch { namespace backends { namespace qnn { @@ -36,9 +39,7 @@ class TensorWrapper { const void* data = nullptr, bool copy_data = false); - executorch::runtime::Error FillDataBuffer( - const void* data, - bool copy_data = false); + executorch::runtime::Error FillDataBuffer(const void* data); executorch::runtime::Error AllocateDataBuffer(); diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py index 2c7ab2abd02..447923c2274 100644 --- a/backends/qualcomm/export_utils.py +++ b/backends/qualcomm/export_utils.py @@ -34,10 +34,10 @@ QcomChipset, QnnExecuTorchBackendType, QnnExecuTorchHtpPerformanceMode, + QnnExecuTorchLpaiTargetEnv, QnnExecuTorchOpPackageOptions, ) from 
executorch.backends.qualcomm.utils.constants import ( - DSP_VERSION, HEXAGON_SDK_ROOT, HEXAGON_TOOLS_ROOT, ) @@ -76,7 +76,7 @@ class QnnConfig: backend (str): The target backend, such as htp, gpu, etc. QnnConfig will then parse this to type QnnExecuTorchBackendType. soc_model (QcomChipset): The target Qualcomm System on Chip (SoC) model. build_folder (str): Path to cmake binary directory for target platform, e.g., /path/to/build-android. - direct_build_folder (str): Path to cmake binary directory for direct_mode. E.g., path/to/build-hexagon. + direct_build_folder (str): Path to cmake binary directory for direct_mode. E.g., path/to/build-direct. target (str): Target platform for deployment. online_prepare (bool): Compose QNN graph on device if set to True. shared_buffer (bool): Enables usage of shared buffer(zero-copy mechanism) between application and backend for graph I/O during runtime. @@ -235,16 +235,14 @@ def __init__( ) self.runner = runner if qnn_config.direct_build_folder: - required_env = [HEXAGON_SDK_ROOT, HEXAGON_TOOLS_ROOT, DSP_VERSION] + required_env = [HEXAGON_SDK_ROOT, HEXAGON_TOOLS_ROOT] assert all( var in os.environ for var in required_env ), f"Please ensure the following environment variables are set: {required_env}" self.hexagon_sdk_root = os.getenv(HEXAGON_SDK_ROOT) self.hexagon_tools_root = os.getenv(HEXAGON_TOOLS_ROOT) - self.dsp_arch = os.getenv(DSP_VERSION) logging.info(f"{HEXAGON_SDK_ROOT}={self.hexagon_sdk_root}") logging.info(f"{HEXAGON_TOOLS_ROOT}={self.hexagon_tools_root}") - logging.info(f"{DSP_VERSION}={self.dsp_arch}") self.qnn_config = qnn_config self.qnn_sdk = os.getenv("QNN_SDK_ROOT") self.build_path = qnn_config.build_folder @@ -287,17 +285,25 @@ def __init__( if self.direct_build_folder: direct_general_artifacts = [ f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so", - f"{self.direct_build_folder}/backends/qualcomm/libqnn_executorch_backend.so", - 
f"{self.direct_build_folder}/backends/qualcomm/qnn_executorch/direct_mode/libqnn_executorch_skel.so", ] self.backend_library_paths.update( { QnnExecuTorchBackendType.kHtpBackend: [ + f"{self.direct_build_folder}/backends/qualcomm/libqnn_executorch_backend.so", + f"{self.direct_build_folder}/backends/qualcomm/qnn_executorch/direct_mode/libqnn_executorch_skel.so", f"{self.qnn_sdk}/lib/hexagon-v{self.htp_arch}/unsigned/libQnnHtpV{self.htp_arch}.so", f"{self.qnn_sdk}/lib/hexagon-v{self.htp_arch}/unsigned/libQnnSystem.so", f"{self.hexagon_tools_root}/Tools/target/hexagon/lib/v{self.htp_arch}/G0/pic/libc++abi.so.1", f"{self.hexagon_tools_root}/Tools/target/hexagon/lib/v{self.htp_arch}/G0/pic/libc++.so.1", - ] + ], + QnnExecuTorchBackendType.kLpaiBackend: [ + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libqnn_executorch_backend.so", + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libqnn_executorch_skel.so", + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libQnnLpai.so", + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libQnnSystem.so", + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libc++abi.so.1", + f"{self.qnn_sdk}/lib/lpai-v{self.lpai_hw_ver}/signed/libc++.so.1", + ], } ) for _, library_paths in self.backend_library_paths.items(): @@ -378,6 +384,12 @@ def push( # noqa: C901 # backend libraries for backend in backends: artifacts.extend(self.backend_library_paths[backend]) + + # Ensure that all necessary library artifacts exists. 
+ missing = [path for path in artifacts if not os.path.exists(path)] + assert not missing, "Missing the following libraries:\n" + "\n".join( + f" {p}" for p in missing + ) with tempfile.TemporaryDirectory() as tmp_dir: input_list_file, input_files = generate_inputs( tmp_dir, self.input_list_filename, inputs @@ -440,6 +452,13 @@ def execute( ) + self.extra_cmds ) + if self.qnn_config.direct_build_folder: + qnn_executor_runner_args = " ".join( + [ + qnn_executor_runner_args, + f"--domain_id {get_dsp_id(self.qnn_config.backend)}", + ] + ) qnn_executor_runner_cmds = " ".join( [ f"cd {self.workspace} &&", @@ -526,7 +545,9 @@ def build_executorch_binary( ): raise RuntimeError("Currently LPAI backend only supports offline_prepare.") backend_options = { - QnnExecuTorchBackendType.kLpaiBackend: generate_lpai_compiler_spec(), + QnnExecuTorchBackendType.kLpaiBackend: generate_lpai_compiler_spec( + target_env=get_lpai_target_env(qnn_config) + ), QnnExecuTorchBackendType.kGpuBackend: generate_gpu_compiler_spec(), QnnExecuTorchBackendType.kHtpBackend: generate_htp_compiler_spec( use_fp16=False if quant_dtype is not None else True, @@ -652,10 +673,31 @@ def make_quantizer( return quantizer +def get_lpai_target_env(qnn_config: QnnConfig): + if qnn_config.enable_x86_64: + return QnnExecuTorchLpaiTargetEnv.kX86 + elif qnn_config.direct_build_folder: + return QnnExecuTorchLpaiTargetEnv.kAdsp + return QnnExecuTorchLpaiTargetEnv.kArm + + def get_backend_type(backend: str): return getattr(QnnExecuTorchBackendType, f"k{backend.title()}Backend") +def get_dsp_id(backend): + dsp_id_map = { + QnnExecuTorchBackendType.kLpaiBackend: 0, + QnnExecuTorchBackendType.kHtpBackend: 3, + } + if backend not in dsp_id_map: + raise ValueError( + f"Unsupported backend {backend} for direct mode. 
" + f"Supported: {list(dsp_id_map.keys())}" + ) + return dsp_id_map[backend] + + def setup_common_args_and_variables(): parser = argparse.ArgumentParser() @@ -822,7 +864,7 @@ def setup_common_args_and_variables(): parser.add_argument( "--direct_build_folder", - help="Path to cmake binary directory for direct_mode. E.g., path/to/build-hexagon." + help="Path to cmake binary directory for direct_mode. E.g., path/to/build-direct." "If enabled, run self-defined protocol to control fastrpc communication.", type=str, ) diff --git a/backends/qualcomm/requirements.txt b/backends/qualcomm/requirements.txt index 038e94f1b1f..34acf1e34f0 100644 --- a/backends/qualcomm/requirements.txt +++ b/backends/qualcomm/requirements.txt @@ -3,3 +3,4 @@ pydot py-cpuinfo requests tabulate +openpyxl diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 51ac8312072..33cca5350d9 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -148,8 +148,7 @@ Error QnnExecuTorchBackend::execute( Error::Ok) { // update data ptr only should be fine input_tensor->FillDataBuffer( - args[args_index]->toTensor().const_data_ptr(), - false /* copy_data */); + args[args_index]->toTensor().const_data_ptr()); // use the real input shape instead of nominal one to make sure // dynamic shape is functional auto dims = args[args_index]->toTensor().sizes(); @@ -167,7 +166,7 @@ Error QnnExecuTorchBackend::execute( void* mutable_data_ptr = args[args_index]->toTensor().mutable_data_ptr(); if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != Error::Ok) { - output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */); + output_tensor->FillDataBuffer(mutable_data_ptr); } args_index++; } diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index b55e3fa15f1..b1095ca3aac 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ 
b/backends/qualcomm/runtime/QnnManager.cpp @@ -376,8 +376,7 @@ Error QnnManager::AllocateTensor(const std::string& graph_name) { mutable_buffer_id_to_memory_map.end()) { // Fill the same memory for I/O of mutable buffer tensor_wrapper->FillDataBuffer( - mutable_buffer_id_to_memory_map[mutable_buffer_id], - false /* copy_data */); + mutable_buffer_id_to_memory_map[mutable_buffer_id]); } output_tensors_[graph_name].emplace_back(std::move(tensor_wrapper)); } @@ -581,9 +580,7 @@ Error QnnManager::CompileDlc() { mutable_buffer_id_to_memory_map.find(mutable_buffer_id) != mutable_buffer_id_to_memory_map.end()) { // Fill the same memory for I/O of mutable buffer - tw->FillDataBuffer( - mutable_buffer_id_to_memory_map[mutable_buffer_id], - false /* copy_data */); + tw->FillDataBuffer(mutable_buffer_id_to_memory_map[mutable_buffer_id]); } graph_outputs.push_back(tw); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 3dd1738d33b..94c38f624e0 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -8,6 +8,7 @@ #include #include + namespace executorch { namespace backends { namespace qnn { @@ -22,11 +23,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( std::uint32_t num_graphs; QnnSystemContext_GraphInfo_t* graphs = nullptr; const QnnSystemContext_BinaryInfo_t* binaryinfo{nullptr}; - Qnn_ContextBinarySize_t binaryinfo_size = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; error = qnn_sys_interface.qnn_system_context_get_binary_info( - sys_context_handle_, buffer, nbytes, &binaryinfo, &binaryinfo_size); + sys_context_handle_, buffer, nbytes, &binaryinfo); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( diff --git a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h index db31404955e..d65fefc0018 100644 --- 
a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h +++ b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h @@ -79,6 +79,7 @@ class QnnBackendUnifiedRegistry { #endif static constexpr const char* gpu_library_name_ = "libQnnGpu.so"; static constexpr const char* dsp_library_name_ = "libQnnDsp.so"; + // Lpai library name is same for both traditional build and hexagon build. static constexpr const char* lpai_library_name_ = "libQnnLpai.so"; std::unique_ptr GetImplementationConfig( diff --git a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h index 598d38f9e33..2a49505a672 100644 --- a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h +++ b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h @@ -66,6 +66,7 @@ class QnnInterface { DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); DEFINE_SHIM_FUNCTION_INTERFACE(graph_set_config, graphSetConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_get_property, graphGetProperty); // --------- QnnLog --------- DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); diff --git a/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h index b77c7c2903e..28c3ed733f4 100644 --- a/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h +++ b/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h @@ -40,7 +40,7 @@ class QnnSystemInterface { systemContextCreate); DEFINE_SHIM_FUNCTION_SYS_INTERFACE( system_context_get_binary_info, - systemContextGetBinaryInfo); + systemContextGetMetaData); DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); private: diff --git a/backends/qualcomm/runtime/backends/direct_mode/CMakeLists.txt b/backends/qualcomm/runtime/backends/direct_mode/CMakeLists.txt index 
8beebddc343..07ec2befd6f 100644 --- a/backends/qualcomm/runtime/backends/direct_mode/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/direct_mode/CMakeLists.txt @@ -35,14 +35,17 @@ add_library( target_include_directories( qnn_executorch_skel PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ) +# aDSP will also use cDSP c/c++ library since aDSP does not have these +# libraries. target_link_libraries( qnn_executorch_skel PRIVATE extension_data_loader qnn_executorch_backend + quantized_ops_lib etdump - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc.so - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++.so.1 - ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++abi.so.1 + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc.so + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc++.so.1 + ${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${CDSP_VERSION}/G0/pic/libc++abi.so.1 ) target_compile_options(qnn_executorch_skel PRIVATE "-fvisibility=default") diff --git a/backends/qualcomm/runtime/backends/direct_mode/QnnExecuTorchIdlWrapper.cpp b/backends/qualcomm/runtime/backends/direct_mode/QnnExecuTorchIdlWrapper.cpp index 1c212d5fb65..d1a6bfc9590 100644 --- a/backends/qualcomm/runtime/backends/direct_mode/QnnExecuTorchIdlWrapper.cpp +++ b/backends/qualcomm/runtime/backends/direct_mode/QnnExecuTorchIdlWrapper.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -33,6 +34,16 @@ namespace executorch { namespace backends { namespace qnn { +size_t size_with_alignment_padding(size_t sz) { + return QNN_TENSOR_ALIGNMENT + sz; +} + +void* align_ptr(void* ptr) { + void* addr = reinterpret_cast( + ((size_t)ptr + (QNN_TENSOR_ALIGNMENT - 1)) & ~(QNN_TENSOR_ALIGNMENT - 1)); + return addr; +} + // Code logic below is similar to qnn_executor_runner to load pte file. QnnExecuTorchIdlWrapper::QnnExecuTorchIdlWrapper( const char* pte_path, @@ -113,12 +124,13 @@ QnnExecuTorchIdlWrapper::QnnExecuTorchIdlWrapper( input_tensors_.resize(method_->inputs_size()); for (int i = 0; i < input_tensors_.size(); i++) { Result tensor_info = method_meta_->input_tensor_meta(i); - input_tensors_[i].resize(tensor_info->nbytes()); + input_tensors_[i].resize( + size_with_alignment_padding(tensor_info->nbytes())); input_tensor_impls_.emplace_back(TensorImpl( tensor_info->scalar_type(), tensor_info->sizes().size(), const_cast(tensor_info->sizes().data()), - input_tensors_[i].data(), + align_ptr(input_tensors_[i].data()), const_cast( tensor_info->dim_order().data()))); Error ret = method_->set_input(Tensor(&input_tensor_impls_.back()), i); @@ -131,9 +143,10 @@ QnnExecuTorchIdlWrapper::QnnExecuTorchIdlWrapper( output_tensors_.resize(method_->outputs_size()); for (int i = 0; i < output_tensors_.size(); ++i) { Result tensor_info = method_meta_->output_tensor_meta(i); - output_tensors_[i].resize(tensor_info->nbytes()); + output_tensors_[i].resize( + size_with_alignment_padding(tensor_info->nbytes())); Error ret = method_->set_output_data_ptr( - output_tensors_[i].data(), tensor_info->nbytes(), i); + align_ptr(output_tensors_[i].data()), tensor_info->nbytes(), i); if (ret != Error::Ok) { FARF(RUNTIME_ERROR, "Failed to set output tensor: %d", (int)ret); return; @@ -190,9 +203,10 @@ Error QnnExecuTorchIdlWrapper::execute_all( status = Error::Internal; return status; } - - ssize_t bytes = - read(fd, input_tensors_[i].data(), 
input_tensors_[i].size()); + ssize_t bytes = read( + fd, + align_ptr(input_tensors_[i].data()), + method_meta_->input_tensor_meta(i)->nbytes()); if (bytes < 0) { FARF(RUNTIME_ERROR, "Failed to read data from file to input_tensor."); status = Error::Internal; @@ -231,14 +245,26 @@ Error QnnExecuTorchIdlWrapper::execute_all( return status; } + size_t expected_bytes = method_meta_->output_tensor_meta(i)->nbytes(); ssize_t bytes = - write(fd, output_tensors_[i].data(), output_tensors_[i].size()); + write(fd, align_ptr(output_tensors_[i].data()), expected_bytes); if (bytes < 0) { FARF(RUNTIME_ERROR, "Failed to write data to output file."); close(fd); status = Error::Internal; return status; } + if (static_cast(bytes) != expected_bytes) { + FARF( + RUNTIME_ERROR, + "Output %zu: wrote %zd bytes, expected %zu bytes", + i, + bytes, + expected_bytes); + close(fd); + status = Error::Internal; + return status; + } close(fd); } save_end = Clock::now(); diff --git a/backends/qualcomm/runtime/backends/direct_mode/README.md b/backends/qualcomm/runtime/backends/direct_mode/README.md index 0fa89114938..35b86d3d7b0 100644 --- a/backends/qualcomm/runtime/backends/direct_mode/README.md +++ b/backends/qualcomm/runtime/backends/direct_mode/README.md @@ -1,36 +1,44 @@ # Direct Mode ## Introduction -This tutorial will cover **Direct Mode**, also known as the **Native DSP Backend** in the QNN SDK. The QNN SDK provides predefined protocols for general use cases. However, there may be situations where users want to go further and define their own RPC calls for customized workflows. For example, a user might want to perform an RPC call that handles model loading, input loading and setting, execution, and output saving in a single call. This is something not possible with QNN's predefined RPC protocol. This approach can improve performance by giving users control over where resources are loaded and by reducing the number of RPC calls. 
To address this need, **Direct Mode** was introduced, providing flexibility for users to define their own FastRPC protocol. For more information about FastRPC, please refer to the [Hexagon SDK](https://www.qualcomm.com/developer/software/hexagon-npu-sdk) for details on setup, building, and defining custom protocols. +This tutorial will cover **Direct Mode**, also known as the **Native DSP Backend** in the QNN SDK. The QNN SDK provides predefined protocols for general use cases. However, there may be situations where users want to go further and define their own RPC calls for customized workflows. For example, a user might want to reduce the number of RPC calls by combining model loading, input setting, execution, and output retrieval into fewer round trips. This approach can also improve performance by giving users control over where resources are loaded. To address this need, **Direct Mode** was introduced, providing flexibility for users to define their own FastRPC protocol. For more information about FastRPC, please refer to the [Hexagon SDK](https://www.qualcomm.com/developer/software/hexagon-npu-sdk) for details on setup, building, and defining custom protocols. -## Requirments +## Requirements Below are the required files to enable **Direct Mode**. Example files are also provided for reference. 1. A **.idl** file that defines the interface. A sample self-defined protocol can be found under [qnn_executorch.idl](qnn_executorch.idl). -This file specifies how the AP and DSP communicate. It can be compiled into header, stub, and skel files using the Hexagon SDK’s **qaic** compiler. **qaic** compiler and more information about **qaic** can be found under `$HEXAGON_SDK_ROOT/ipc/fastrpc/qaic/Ubuntu` +This file specifies how the AP and DSP communicate. It can be compiled into header, stub, and skel files using the Hexagon SDK's **qaic** compiler. More information related to **qaic** compiler can be found under `$HEXAGON_SDK_ROOT/ipc/fastrpc/qaic/Ubuntu`. -2. 
Implementation for the skel. An example for skel implementation for [qnn_executorch.idl](qnn_executorch.idl) can be found in [qnn_executorch_imp.cpp](qnn_executorch_imp.cpp). +2. Implementation for the skel. An example skel implementation for [qnn_executorch.idl](qnn_executorch.idl) can be found in [qnn_executorch_imp.cpp](qnn_executorch_imp.cpp). -3. Implementation to control session and perform RPC calls. An example runner can be found in [qnn_executor_direct_runner.cpp](../../../../../examples/qualcomm/direct_executor_runner/qnn_executor_direct_runner.cpp) +3. Implementation to control session and perform RPC calls. An example runner can be found in [qnn_executor_direct_runner.cpp](../../../../../examples/qualcomm/direct_executor_runner/qnn_executor_direct_runner.cpp). ## Instructions Below are the steps to build **Direct Mode** artifacts and execute with **Direct Mode**. -1. Export required environment variables. Please export the following 3 variables: +1. Export required environment variables. Please export the following 2 variables: - `HEXAGON_SDK_ROOT`: Path to Hexagon SDK root directory. - `HEXAGON_TOOLS_ROOT`: Hexagon SDK includes 1 or more toolchains. If you are unsure which toolchain to use, you can check `$QNN_SDK_ROOT/share/QNN/OpPackageGenerator/makefiles/HTP/Makefile`. Inside, you will find a mapping between devices and toolchains. The path to `HEXAGON_TOOLS_ROOT` should look similar to `$HEXAGON_SDK_ROOT/tools/HEXAGON_Tools/19.0.04` - - `DSP_VERSION`: The target DSP architecture (e.g., `v79`). - -2. Build necessary artifacts -```bash -backends/qualcomm/scripts/build.sh --enable_hexagon -``` -3. Execution -Below is an example to execute a unit test with direct mode using qnn_executor_direct_runner. -``` -python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_adaptive_avg_pool2d --soc_model SM8750 --device $DEVICE_ID --build_folder build-android --direct_build_folder build-hexagon/ -``` + +2. 
Build necessary artifacts. + The build differs depending on the target backend. cDSP (HTP) is relatively simple since signing is not required. On the other hand, aDSP (LPAI) requires all libraries running on aDSP to be signed. The build script handles all of this automatically. For example, if a user has an SM8850 device: + + To build for cDSP: + ```bash + backends/qualcomm/scripts/build.sh --build_direct_mode 3 --soc_model SM8850 + ``` + To build for aDSP: + ```bash + backends/qualcomm/scripts/build.sh --build_direct_mode 0 --soc_model SM8850 + ``` + +3. Execution. + Below is an example to execute a unit test with direct mode using `qnn_executor_direct_runner`. + Replace `$BACKEND` with either `htp` or `lpai`, and make sure it matches the backend chosen in step 2: + ``` + python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_adaptive_avg_pool2d --soc_model SM8850 --device $DEVICE_ID --build_folder build-android --direct_build_folder build-direct/ --backend $BACKEND + ``` ### Note The model execution time for `qnn_executor_direct_runner` is expected to be faster than `qnn_executor_runner` because it reduces DMA usage and minimizes the number of RPC calls. However, you may observe that the total completion time for `qnn_executor_direct_runner` appears longer. This is expected in the demo script, since the runner performs file loading and saving on the DSP side. These operations can be slightly slower compared to when the AP handles them. -In production scenarios, this difference should not be a concern. Typically, inputs will be accessed directly from memory and outputs will be handled in more optimized ways. The file I/O in the demo is included only to align the behavior of `qnn_executor_direct_runner` with `qnn_executor_runner`, and to simplify testing. It is not representative of the intended performance characteristics in real-world usage. 
\ No newline at end of file +In production scenarios, this difference should not be a concern. Typically, inputs will be accessed directly from memory and outputs will be handled in more optimized ways. The file I/O in the demo is included only to align the behavior of `qnn_executor_direct_runner` with `qnn_executor_runner`, and to simplify testing. It is not representative of the intended performance characteristics in real-world usage. diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp index 11f90722f32..d5203898f6b 100644 --- a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp +++ b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp @@ -39,6 +39,14 @@ Error LpaiContext::MakeConfig(std::vector& config) { config.push_back(&context_config_[i]); } +#ifdef __hexagon__ + QnnContext_Config_t adsp_context_config; + adsp_context_config.option = QNN_CONTEXT_CONFIG_PERSISTENT_BINARY; + adsp_context_config.isPersistentBinary = 1; + context_config_.push_back(adsp_context_config); + config.push_back(&context_config_.back()); +#endif + config.push_back(nullptr); return Error::Ok; } diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiGraph.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.cpp index f3bb13d7724..7373ceff8d8 100644 --- a/backends/qualcomm/runtime/backends/lpai/LpaiGraph.cpp +++ b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.cpp @@ -17,6 +17,56 @@ using executorch::runtime::Error; Error LpaiGraph::AfterRetrieveGraph(const std::string& graph_name) { std::vector graph_custom_config; QnnLpaiGraph_CustomConfig_t* p_custom_config = nullptr; + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); + Qnn_ErrorHandle_t error; + +#ifdef __hexagon__ + uint32_t scratch_size = 0; + uint32_t persistent_size = 0; + QnnLpaiGraph_CustomProperty_t custom_props[2]; + custom_props[0].option = QNN_LPAI_GRAPH_GET_PROP_SCRATCH_MEM_SIZE; + custom_props[0].property = 
&scratch_size; + custom_props[1].option = QNN_LPAI_GRAPH_GET_PROP_PERSISTENT_MEM_SIZE; + custom_props[1].property = &persistent_size; + + QnnGraph_Property_t graph_props[2]; + graph_props[0].option = QNN_GRAPH_PROPERTY_OPTION_CUSTOM; + graph_props[0].customProperty = &custom_props[0]; + graph_props[1].option = QNN_GRAPH_PROPERTY_OPTION_CUSTOM; + graph_props[1].customProperty = &custom_props[1]; + QnnGraph_Property_t* graph_prop_ptrs[3] = {0}; + graph_prop_ptrs[0] = &graph_props[0]; + graph_prop_ptrs[1] = &graph_props[1]; + + error = qnn_interface.qnn_graph_get_property( + handle_[graph_name], graph_prop_ptrs); + + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "failed to get graph property: %d", QNN_GET_ERROR_CODE(error)); + return Error::Internal; + } + + scratch_buf_.resize(scratch_size); + p_custom_config = AllocGraphCustomConfig(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_SCRATCH_MEM; + auto p_scratch_config = AllocMem(); + p_scratch_config->memType = QNN_LPAI_MEM_TYPE_DDR; + p_scratch_config->size = scratch_size; + p_scratch_config->addr = scratch_buf_.data(); + p_custom_config->config = p_scratch_config; + graph_custom_config.push_back(p_custom_config); + + persistent_buf_.resize(persistent_size); + p_custom_config = AllocGraphCustomConfig(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_PERSISTENT_MEM_DEFAULT; + auto p_persistent_config = AllocMem(); + p_persistent_config->memType = QNN_LPAI_MEM_TYPE_DDR; + p_persistent_config->size = persistent_size; + p_persistent_config->addr = persistent_buf_.data(); + p_custom_config->config = p_persistent_config; + graph_custom_config.push_back(p_custom_config); +#endif // perf config p_custom_config = AllocGraphCustomConfig(); @@ -56,8 +106,7 @@ Error LpaiGraph::AfterRetrieveGraph(const std::string& graph_name) { config.push_back(nullptr); // LPAI specific configs can only be set after graph create - const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); - Qnn_ErrorHandle_t 
error = + error = qnn_interface.qnn_graph_set_config(handle_[graph_name], config.data()); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h index 7f3d8c5b148..f6380c20376 100644 --- a/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h +++ b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h @@ -51,6 +51,7 @@ class LpaiGraph : public QnnGraph { std::vector> lpai_core_affinity_; std::vector> lpai_prepare_; + std::vector scratch_buf_, persistent_buf_; QnnLpaiGraph_Mem_t* AllocMem() { lpai_mem_.emplace_back(std::make_unique()); diff --git a/backends/qualcomm/runtime/backends/lpai/target/LpaiContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/target/LpaiContextCustomConfig.cpp index dc8352f1205..f9ceb8793f3 100644 --- a/backends/qualcomm/runtime/backends/lpai/target/LpaiContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/lpai/target/LpaiContextCustomConfig.cpp @@ -15,13 +15,15 @@ namespace qnn { std::vector LpaiContextCustomConfig::CreateContextCustomConfig() { std::vector ret; - QnnLpaiContext_CustomConfig_t* p_custom_config = nullptr; +#ifndef __hexagon__ + QnnLpaiContext_CustomConfig_t* p_custom_config = nullptr; // TODO: support graph based execution in island mode p_custom_config = AllocContextCustomConfig(); p_custom_config->option = QNN_LPAI_CONTEXT_SET_CFG_ENABLE_ISLAND; p_custom_config->config = nullptr; ret.push_back(static_cast(p_custom_config)); +#endif return ret; } diff --git a/backends/qualcomm/runtime/backends/lpai/target/LpaiDevice.cpp b/backends/qualcomm/runtime/backends/lpai/target/LpaiDevice.cpp index ce592303b45..4bcba99f5e3 100644 --- a/backends/qualcomm/runtime/backends/lpai/target/LpaiDevice.cpp +++ b/backends/qualcomm/runtime/backends/lpai/target/LpaiDevice.cpp @@ -13,7 +13,11 @@ namespace backends { namespace qnn { Error LpaiDevice::Configure() { +#ifndef __hexagon__ return 
QnnDevice::Configure(); +#else + return Error::Ok; +#endif } } // namespace qnn diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 39418e63b40..498bf924921 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -36,6 +36,7 @@ fi set -o xtrace usage() { + set +x echo "Usage: Build the aarch64 version of executor runner or the python interface of Qnn Manager" echo "" echo "QNN SDK and Android NDK will be auto-downloaded if not set." @@ -45,6 +46,16 @@ usage() { echo "TOOLCHAIN_ROOT_TARGET=/path/to/sysroots/xx_target for linux embedded with --enable_linux_embedded)" echo "" echo "e.g.: executorch$ ./backends/qualcomm/scripts/build.sh --skip_x86_64" + echo "" + echo "Direct mode: Use --build_direct_mode --soc_model to enable." + echo "You can choose either LPAI (ADSP) or CDSP (HTP) as the target DSP:" + echo " LPAI (ADSP): dsp_type=0" + echo " CDSP (HTP): dsp_type=3" + echo "" + echo "e.g. Build with LPAI direct mode for SM8850 device:" + echo " executorch$ ./backends/qualcomm/scripts/build.sh --build_direct_mode 0 --soc_model SM8850" + echo "e.g. Build with CDSP direct mode for SM8750 device:" + echo " executorch$ ./backends/qualcomm/scripts/build.sh --build_direct_mode 3 --soc_model SM8750" exit 1 } @@ -56,15 +67,16 @@ CMAKE_X86_64="build-x86" BUILD_ANDROID="true" CMAKE_ANDROID="build-android" BUILD_HEXAGON="false" -CMAKE_HEXAGON="build-hexagon" +CMAKE_HEXAGON="build-direct" BUILD_OE_LINUX="false" CMAKE_OE_LINUX="build-oe-linux" CLEAN="true" BUILD_TYPE="RelWithDebInfo" BUILD_JOB_NUMBER="16" -# Default to use CDSP for now -DSP_TYPE=3 +# Default DSP_TYPE=-1 means direct mode is disabled. 
+DSP_TYPE=-1 +SOC_MODEL="" if [ -z "$PYTHON_EXECUTABLE" ]; then PYTHON_EXECUTABLE="python3" @@ -74,7 +86,7 @@ if [ -z "$BUCK2" ]; then BUCK2="buck2" fi -long_options=skip_x86_64,skip_linux_android,enable_linux_embedded,enable_hexagon,no_clean,release,job_number:,dsp_type: +long_options=skip_x86_64,skip_linux_android,enable_linux_embedded,build_direct_mode:,soc_model:,no_clean,release,job_number: parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@") eval set -- "$parsed_args" @@ -83,18 +95,41 @@ while true ; do case "$1" in --skip_x86_64) BUILD_X86_64="false"; shift;; --skip_linux_android) BUILD_ANDROID="false"; shift;; - --enable_hexagon) BUILD_HEXAGON="true"; shift;; + --build_direct_mode) DSP_TYPE="$2"; BUILD_HEXAGON="true"; shift 2;; + --soc_model) SOC_MODEL="$2"; shift 2;; --enable_linux_embedded) BUILD_ANDROID="false"; BUILD_OE_LINUX="true"; shift;; --no_clean) CLEAN="false"; shift;; --release) BUILD_TYPE="Release"; shift;; --job_number) BUILD_JOB_NUMBER="$2"; shift 2;; - --dsp_type) DSP_TYPE="$2"; shift 2;; --) shift; break;; esac done PRJ_ROOT="$( cd "$(dirname "$0")/../../.." ; pwd -P)" +if [ "$DSP_TYPE" -ne -1 ]; then + if [ "$DSP_TYPE" != "0" ] && [ "$DSP_TYPE" != "3" ]; then + echo "Error: --build_direct_mode only accepts 0 (ADSP/LPAI) or 3 (CDSP/HTP)." + exit 1 + fi + + if [ -z "$SOC_MODEL" ]; then + echo "Error: --soc_model is required when using --build_direct_mode." + echo "e.g. 
--soc_model SM8850" + exit 1 + fi + + source "${SCRIPT_DIR}/build_utils.sh" + resolve_soc_info "$PYTHON_EXECUTABLE" "$SOC_MODEL" "$DSP_TYPE" + HTP_ARCH="v${HTP_ARCH}" + if [ -n "$LPAI_HW_VER" ]; then + LPAI_HW_VER="v${LPAI_HW_VER}" + fi + echo "[QNN Direct Mode] SoC model: ${SOC_MODEL}" + echo "[QNN Direct Mode] HTP arch version: ${HTP_ARCH}" + echo "[QNN Direct Mode] LPAI hardware version: ${LPAI_HW_VER}" +fi + if [ "$BUILD_ANDROID" = true ]; then if [[ -z ${ANDROID_NDK_ROOT} ]]; then @@ -144,9 +179,6 @@ if [ "$BUILD_ANDROID" = true ]; then EXAMPLE_ROOT=examples/qualcomm CMAKE_PREFIX_PATH="${BUILD_ROOT};${BUILD_ROOT}/third-party/gflags;" - # DSP_TYPE variable only matters when building direct_mode. - # Ignore the variable for traditional mode. - if [ "$BUILD_HEXAGON" = "true" ]; then DIRECT_MODE_FLAG="-DBUILD_DIRECT_MODE=ON" else @@ -187,8 +219,7 @@ if [ "$BUILD_ANDROID" = true ]; then cmake --build $LLAMA_EXAMPLE_ROOT -j$BUILD_JOB_NUMBER fi -# TODO: Currently, DSP Domain is set to 3 (cdsp). In future, either create 2 folders: build_cdsp, build_adsp when supporting LPAI, or -# see if there's a way to build both cdsp and adsp in 1 library. + if [ "$BUILD_HEXAGON" = true ]; then if [[ -z ${ANDROID_NDK_ROOT} ]]; then echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndkXX" @@ -205,11 +236,6 @@ if [ "$BUILD_HEXAGON" = true ]; then exit -1 fi - if [[ -z ${DSP_VERSION} ]]; then - echo "Please export DSP_VERSION=xx. e.g. For SM8750, please export v79. Conversion table can be found in _soc_info_table under executorch/backends/qualcomm/serialization/qc_schema.py." 
- exit -1 - fi - BUILD_ROOT=$PRJ_ROOT/$CMAKE_HEXAGON if [ "$CLEAN" = true ]; then @@ -240,7 +266,7 @@ if [ "$BUILD_HEXAGON" = true ]; then -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DHEXAGON_SDK_ROOT=$HEXAGON_SDK_ROOT \ -DHEXAGON_TOOLS_ROOT=$HEXAGON_TOOLS_ROOT \ - -DDSP_VERSION=$DSP_VERSION \ + -DCDSP_VERSION=$HTP_ARCH \ -DCMAKE_TOOLCHAIN_FILE=$HEXAGON_SDK_ROOT/build/cmake/hexagon_toolchain.cmake \ -DDSP_TYPE=$DSP_TYPE \ -DANDROID_ABI='arm64-v8a' \ @@ -250,6 +276,10 @@ if [ "$BUILD_HEXAGON" = true ]; then -B$BUILD_ROOT cmake --build $BUILD_ROOT -j$BUILD_JOB_NUMBER --target install + + if [ "$DSP_TYPE" = "0" ]; then + bash $SCRIPT_DIR/sign_library.sh --direct_mode --htp_arch $HTP_ARCH --lpai_arch $LPAI_HW_VER + fi fi diff --git a/backends/qualcomm/scripts/build_utils.sh b/backends/qualcomm/scripts/build_utils.sh new file mode 100644 index 00000000000..81a7f2d9f2d --- /dev/null +++ b/backends/qualcomm/scripts/build_utils.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Resolve SoC model to HTP arch version and optionally LPAI hardware version. +# Sets HTP_ARCH (always) and LPAI_HW_VER (only when DSP_TYPE=0) in caller's scope. +# Arguments: +# $1 - PYTHON_EXECUTABLE +# $2 - SOC_MODEL +# $3 - DSP_TYPE +resolve_soc_info() { + local python_exec="$1" + local soc_model="$2" + local dsp_type="$3" + + HTP_ARCH=$($python_exec -c " +import sys, os +devnull = open(os.devnull, 'w') +old_stdout = sys.stdout +sys.stdout = devnull +from executorch.backends.qualcomm.utils.utils import get_soc_to_htp_arch_map +sys.stdout = old_stdout +m = get_soc_to_htp_arch_map() +if '${soc_model}' not in m: + sys.exit(1) +print(m['${soc_model}'].value) +" 2>/dev/null) || { + echo "Error: SoC model '${soc_model}' not found in HTP arch map." 
+ echo "Check supported models in executorch/backends/qualcomm/utils/utils.py get_soc_to_htp_arch_map()." + exit 1 + } + + if [ "$dsp_type" = "0" ]; then + LPAI_HW_VER=$($python_exec -c " +import sys, os +devnull = open(os.devnull, 'w') +old_stdout = sys.stdout +sys.stdout = devnull +from executorch.backends.qualcomm.utils.utils import get_soc_to_lpai_hw_ver_map +sys.stdout = old_stdout +m = get_soc_to_lpai_hw_ver_map() +if '${soc_model}' not in m: + sys.exit(1) +print(m['${soc_model}'].value) +" 2>/dev/null) || { + echo "Error: SoC model '${soc_model}' not found in LPAI hardware version map." + echo "Check supported models in executorch/backends/qualcomm/utils/utils.py get_soc_to_lpai_hw_ver_map()." + exit 1 + } + fi +} diff --git a/backends/qualcomm/scripts/sign_library.sh b/backends/qualcomm/scripts/sign_library.sh index a1c763275b1..014dbb100cd 100755 --- a/backends/qualcomm/scripts/sign_library.sh +++ b/backends/qualcomm/scripts/sign_library.sh @@ -10,21 +10,30 @@ if [[ -z $HEXAGON_SDK_ROOT || -z $QNN_SDK_ROOT ]]; then exit -1 fi -usage() { +usage() { echo "Usage: Sign the LPAI library for a given LPAI architecture" - echo "e.g.: executorch$ $0 --lpai_arch v6" + echo "" + echo "Non-direct mode (default), e.g.:" + echo " executorch$ $0 --lpai_arch v6" + echo "" + echo "Direct mode, e.g.:" + echo " executorch$ $0 --direct_mode --htp_arch v81 --lpai_arch v6" exit 1; } -short=l:,h -long=lpai_arch:,help +short=l:,c:,d,h +long=lpai_arch:,htp_arch:,direct_mode,help args=$(getopt -a -o $short -l $long -n $0 -- $@) eval set -- $args lpai_arch="" +htp_arch="" +direct_mode=false while true; do case $1 in -l | --lpai_arch) lpai_arch=$2; shift 2;; + -c | --htp_arch) htp_arch=$2; shift 2;; + -d | --direct_mode) direct_mode=true; shift;; -h | --help) usage;; --) shift; break;; *) echo "unknown keyword: $1"; usage;; @@ -36,8 +45,27 @@ if [[ -z $lpai_arch ]]; then usage fi +if [ "$direct_mode" = true ]; then + if [[ -z $htp_arch ]]; then + echo "please specify htp_arch for 
direct mode" + usage + fi +fi + +SCRIPT_DIR=$(cd -- "$(dirname "$0")" && pwd) +PRJ_ROOT=$SCRIPT_DIR/../../.. + signed_folder=$QNN_SDK_ROOT/lib/lpai-$lpai_arch/signed signer=$HEXAGON_SDK_ROOT/tools/elfsigner/elfsigner.py mkdir -p $signed_folder -yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/lpai-$lpai_arch/unsigned/libQnnLpaiSkel.so -o $signed_folder +if [ "$direct_mode" = true ]; then + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/lpai-$lpai_arch/unsigned/libQnnLpai.so -o $signed_folder + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/hexagon-$htp_arch/unsigned/libQnnSystem.so -o $signed_folder + yes 2>/dev/null | python $signer -i $HEXAGON_TOOLS_ROOT/Tools/target/hexagon/lib/$htp_arch/G0/pic/libc++abi.so.1 -o $signed_folder + yes 2>/dev/null | python $signer -i $HEXAGON_TOOLS_ROOT/Tools/target/hexagon/lib/$htp_arch/G0/pic/libc++.so.1 -o $signed_folder + yes 2>/dev/null | python $signer -i $PRJ_ROOT/build-direct/backends/qualcomm/qnn_executorch/direct_mode/libqnn_executorch_skel.so -o $signed_folder + yes 2>/dev/null | python $signer -i $PRJ_ROOT/build-direct/backends/qualcomm/libqnn_executorch_backend.so -o $signed_folder +else + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/lpai-$lpai_arch/unsigned/libQnnLpaiSkel.so -o $signed_folder +fi diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 8fc4fd4e6a1..a6470f623ba 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -36,8 +36,8 @@ from executorch.backends.qualcomm.serialization.qc_schema import ( QnnExecuTorchBackendType, QnnExecuTorchHtpPerformanceMode, - QnnExecuTorchLpaiTargetEnv, ) + from executorch.backends.qualcomm.tests.utils import ( convert_pt2e, generate_context_binary, @@ -2400,11 +2400,7 @@ def setUp(self): backend_options = generate_htp_compiler_spec(use_fp16=False) case QnnExecuTorchBackendType.kLpaiBackend: backend_options = 
generate_lpai_compiler_spec( - target_env=( - QnnExecuTorchLpaiTargetEnv.kX86 - if self.enable_x86_64 - else QnnExecuTorchLpaiTargetEnv.kArm - ) + target_env=self.get_lpai_target_env() ) case _: raise ValueError("Backend is not implemented yet") @@ -4750,11 +4746,7 @@ def setUp(self): backend_options = generate_htp_compiler_spec(use_fp16=False) case QnnExecuTorchBackendType.kLpaiBackend: backend_options = generate_lpai_compiler_spec( - target_env=( - QnnExecuTorchLpaiTargetEnv.kX86 - if self.enable_x86_64 - else QnnExecuTorchLpaiTargetEnv.kArm - ) + target_env=self.get_lpai_target_env() ) case _: raise ValueError("Backend is not implemented yet") @@ -5860,11 +5852,7 @@ def setUp(self): backend_options = generate_htp_compiler_spec(use_fp16=False) case QnnExecuTorchBackendType.kLpaiBackend: backend_options = generate_lpai_compiler_spec( - target_env=( - QnnExecuTorchLpaiTargetEnv.kX86 - if self.enable_x86_64 - else QnnExecuTorchLpaiTargetEnv.kArm - ) + target_env=self.get_lpai_target_env() ) case _: raise ValueError("Backend is not implemented yet") @@ -6102,11 +6090,7 @@ def test_qnn_backend_skip_node_id_quantizer(self): backend_options = generate_htp_compiler_spec(use_fp16=False) case QnnExecuTorchBackendType.kLpaiBackend: backend_options = generate_lpai_compiler_spec( - target_env=( - QnnExecuTorchLpaiTargetEnv.kX86 - if self.enable_x86_64 - else QnnExecuTorchLpaiTargetEnv.kArm - ) + target_env=self.get_lpai_target_env() ) case _: raise ValueError("Backend is not implemented yet") @@ -6160,11 +6144,7 @@ def test_qnn_backend_skip_node_op_quantizer(self): backend_options = generate_htp_compiler_spec(use_fp16=False) case QnnExecuTorchBackendType.kLpaiBackend: backend_options = generate_lpai_compiler_spec( - target_env=( - QnnExecuTorchLpaiTargetEnv.kX86 - if self.enable_x86_64 - else QnnExecuTorchLpaiTargetEnv.kArm - ) + target_env=self.get_lpai_target_env() ) case _: raise ValueError("Backend is not implemented yet") diff --git a/backends/qualcomm/tests/utils.py 
b/backends/qualcomm/tests/utils.py index 481c2b71696..93a6dd81f73 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,7 +27,10 @@ ) from executorch.backends.qualcomm.qnn_preprocess import QnnBackend from executorch.backends.qualcomm.quantizer.quantizer import ModuleQConfig, QuantDtype -from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset +from executorch.backends.qualcomm.serialization.qc_schema import ( + QcomChipset, + QnnExecuTorchLpaiTargetEnv, +) from executorch.backends.qualcomm.utils.constants import ( QCOM_DTYPE, QCOM_PASS_ACTIVATE_KEY, @@ -832,3 +835,10 @@ def call(self, graph_module: torch.fx.GraphModule): QCOM_PASS_ACTIVATE_KEY: True, QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY: {"division": division}, } + + def get_lpai_target_env(self): + if self.enable_x86_64: + return QnnExecuTorchLpaiTargetEnv.kX86 + elif self.direct_build_folder: + return QnnExecuTorchLpaiTargetEnv.kAdsp + return QnnExecuTorchLpaiTargetEnv.kArm diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 8a24cfd0aa1..bf11230065c 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -1253,6 +1253,7 @@ def generate_qnn_executorch_compiler_spec( # noqa: C901 ] +# If changing function interface, please ensure it doesn't break backends/qualcomm/scripts/build_utils.sh def get_soc_to_htp_arch_map(): return { "SA8295": HtpArch.V68, @@ -1278,6 +1279,7 @@ def get_soc_to_htp_arch_map(): } +# If changing function interface, please ensure it doesn't break backends/qualcomm/scripts/build_utils.sh def get_soc_to_lpai_hw_ver_map(): return { "SM8850": LpaiHardwareVersion.V6, diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index f1781207d0f..5e63746cf06 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -54,6 +54,7 @@ def main(args): quant_dtype = { 
QnnExecuTorchBackendType.kGpuBackend: None, QnnExecuTorchBackendType.kHtpBackend: QuantDtype.use_8a8w, + QnnExecuTorchBackendType.kLpaiBackend: QuantDtype.use_8a8w, }[qnn_config.backend] build_executorch_binary(