Qualcomm AI Engine Direct - Support MSVC-compatible code (pytorch#19686)

zhaoxul-qti · web-flow · commit 5baf3f9a136c · 2026-06-04T08:28:28.000-07:00
## Summary

### 1. Remove the **designated initializers** for C++

Why does it compile on Linux but not with MSVC on Windows?

- Designated initializers for C++ aggregates were standardized in C++20.
GCC and Clang have supported them as a C++11/14/17 extension — they
silently accept the syntax even when compiling in `-std=c++17` mode.
MSVC is strictly conformant: it only accepts designated initializers
when `/std:c++20` (or `/std:c++latest`) is active.

### 2. Remove the **GNU statement expressions**

Why does it compile on Linux but not with MSVC on Windows?

- The GNU statement expression is a GNU C / GNU C++ language extension
that lets you treat a block of statements as if it were a single
expression that produces a value. It is not part of standard C or C++,
but it is widely supported by GCC and Clang. MSVC does not support it.

### 3. Replace `constexpr` inside the `[&]` lambda

- `ET_INTERNAL_SWITCH` wraps its body, which uses `NAME`, in `[&] { ... }()`.
The `[&]` capture means the lambda captures all local variables by reference,
including `NAME`.
- However, inside the `[&]` lambda, `NAME` is accessed via the closure's
implicit `this` pointer — because it is captured by reference, it is
`(*this).name` in the closure's internal representation. Dereferencing `this`
is not a constant expression, because `this` is a pointer to the closure
object, which only exists at runtime.

Why does it compile on Linux but not with MSVC on Windows?

- GCC and Clang are more permissive here. They apply a special rule: if
the captured variable is itself `constexpr` and its value is a
compile-time constant, they allow it to be used as a constant expression
inside the lambda, effectively treating the capture as a constant
propagation rather than a runtime dereference. This is a
quality-of-implementation extension beyond what the standard strictly
requires.

### 4. Replace `__attribute__((visibility("default")))` with the corresponding MSVC-compatible syntax

- Use the Microsoft-specific C/C++ extensions `__declspec(dllexport)` and
`__declspec(dllimport)` to control symbol visibility when working with
Windows DLLs.
diff --git a/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h b/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h
@@ -70,9 +70,9 @@ class UndefinedQuantizeParamsWrapper final : public QuantizeParamsWrapper {
   }
 
   Qnn_QuantizeParams_t CreateQuantizeParams() override {
-    Qnn_QuantizeParams_t rval = {
-        .encodingDefinition = GetEncodingDefinition(),
-        .quantizationEncoding = GetQuantizationEncoding()};
+    Qnn_QuantizeParams_t rval;
+    rval.encodingDefinition = GetEncodingDefinition();
+    rval.quantizationEncoding = GetQuantizationEncoding();
     return rval;
   }
 };
diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h
@@ -130,9 +130,12 @@ class TensorWrapper {
   std::unique_ptr<char[]> owned_data_;
   bool created_{false};
 
-  Qnn_Tensor_t tensor_ = {
-      .version = QNN_TENSOR_VERSION_2,
-      .v2 = QNN_TENSOR_V2_INIT};
+  Qnn_Tensor_t tensor_ = []() noexcept {
+    Qnn_Tensor_t t{};
+    t.version = QNN_TENSOR_VERSION_2;
+    t.v2 = QNN_TENSOR_V2_INIT;
+    return t;
+  }();
 };
 // base function for Create TensorWrapper
 std::shared_ptr<TensorWrapper> CreateTensorWrapper(
diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h
@@ -27,6 +27,16 @@
 #define QNN_RUNTIME_LPAI_CORE_SELECTION "qnn_runtime_lpai_core_selection"
 #define QNN_RUNTIME_HEAP_PROFILING_PATH "qnn_runtime_heap_profiling_path"
 
+#if defined(_MSC_VER)
+#if defined(QNN_EXECUTORCH_BUILDING_DLL)
+#define QNN_EXECUTORCH_EXPORT __declspec(dllexport)
+#else
+#define QNN_EXECUTORCH_EXPORT __declspec(dllimport)
+#endif
+#else
+#define QNN_EXECUTORCH_EXPORT __attribute__((__visibility__("default")))
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif // __cplusplus
@@ -69,18 +79,18 @@ struct CustomMemTensorInfo {
 /// alignment as MemoryAllocator::kDefaultAlignment.
 /// See runtime/core/memory_allocator.h. The function returns a valid pointer
 /// if allocation is successful.
-__attribute__((__visibility__("default"))) void* QnnExecuTorchAllocCustomMem(
+QNN_EXECUTORCH_EXPORT void* QnnExecuTorchAllocCustomMem(
     size_t bytes,
     size_t alignment);
 
 /// Add tensor to custom memory with custom type descriptor. Create memory
 /// handle to tensor wrapper during execution
-__attribute__((__visibility__("default"))) void
-QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
+QNN_EXECUTORCH_EXPORT void QnnExecuTorchAddCustomMemTensorAddr(
+    void* tensor_addr,
+    void* custom_mem);
 
 /// Free the allocated shared memory.
-__attribute__((__visibility__("default"))) void QnnExecuTorchFreeCustomMem(
-    void* buffer_ptr);
+QNN_EXECUTORCH_EXPORT void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
 
 #ifdef __cplusplus
 }
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/qualcomm/runtime/QnnBackendOptions.h>
+#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
 #include <executorch/backends/qualcomm/runtime/SharedBuffer.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp
@@ -199,8 +199,8 @@ int32_t main(int32_t argc, char** argv) {
     }
   }
   // generate
-  executorch::extension::llm::GenerationConfig config{
-      .temperature = temperature};
+  executorch::extension::llm::GenerationConfig config{};
+  config.temperature = temperature;
 
   config.ignore_eos = FLAGS_ignore_eos;
   config.num_bos = FLAGS_num_bos;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/attention_sink_rope_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/attention_sink_rope_runner.cpp
@@ -40,9 +40,9 @@ Error AttentionSinkRopeRunner::load(
   for (const std::string& method_name : method_names) {
     ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name));
   }
-  eviction_batch_size_ = ET_UNWRAP(module_->get("get_eviction_batch_size"))
-                             .toScalar()
-                             .to<int64_t>();
+  ET_UNWRAP(
+      eviction_batch_size_evalue__, module_->get("get_eviction_batch_size"));
+  eviction_batch_size_ = eviction_batch_size_evalue__.toScalar().to<int64_t>();
   return Error::Ok;
 }
 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp
@@ -347,8 +347,9 @@ Result<int64_t> LhdTokenGenerator::generate(
       shifted_pos++;
 
       // print the token as string, decode it with the Tokenizer object
-      token_callback(
-          ET_UNWRAP_TOKENIZER(this->tokenizer_->decode(prev_token, cur_token)));
+      ET_UNWRAP_TOKENIZER(
+          decoded_token__, this->tokenizer_->decode(prev_token, cur_token));
+      token_callback(decoded_token__);
 
       // data-dependent terminating condition: we have n_eos_ number of EOS
       if (this->eos_ids_->count(cur_token) > 0) {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp
@@ -332,8 +332,9 @@ Result<int64_t> MultimodalLhdTokenGenerator::generate(
       pos++;
 
       // print the token as string, decode it with the Tokenizer object
-      token_callback(
-          ET_UNWRAP_TOKENIZER(this->tokenizer_->decode(prev_token, cur_token)));
+      ET_UNWRAP_TOKENIZER(
+          decoded_token__, this->tokenizer_->decode(prev_token, cur_token));
+      token_callback(decoded_token__);
 
       // data-dependent terminating condition: we have n_eos_ number of EOS
       if (this->eos_ids_->count(cur_token) > 0) {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp
@@ -223,8 +223,8 @@ Error QNNMultimodalRunner::load() {
 
   ET_LOG(Info, "Reading metadata from model");
   // retrieve any method meta, can be either prefill or kv
-  int64_t num_layers =
-      ET_UNWRAP(text_decoder_->get("get_n_layers")).toScalar().to<int64_t>();
+  ET_UNWRAP(num_layers_evalue__, text_decoder_->get("get_n_layers"));
+  int64_t num_layers = num_layers_evalue__.toScalar().to<int64_t>();
 
   ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers");
   // k_cache: [1, n_heads, head_dim, seq_len]
@@ -292,8 +292,9 @@ Error QNNMultimodalRunner::load() {
   // attention
   int32_t sliding_window = context_len_;
   if (text_decoder_->method_names()->count("get_sliding_window") > 0) {
-    sliding_window =
-        ET_UNWRAP(text_decoder_->get("get_sliding_window")).toInt();
+    ET_UNWRAP(
+        sliding_window_evalue__, text_decoder_->get("get_sliding_window"));
+    sliding_window = sliding_window_evalue__.toInt();
   }
   kv_manager_ = std::make_unique<KVManager>(
       KVManager::Metadata{
@@ -527,8 +528,9 @@ executorch::runtime::Error QNNMultimodalRunner::generate(
   // print the first token from prefill. No prev_token so use cur_token for
   // it.
   if (token_callback) {
-    token_callback(
-        ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
+    ET_UNWRAP_TOKENIZER(
+        decoded_token__, tokenizer_->decode(cur_token, cur_token));
+    token_callback(decoded_token__);
   }
   ET_LOG(
       Info,
@@ -538,8 +540,15 @@ executorch::runtime::Error QNNMultimodalRunner::generate(
   // start the main loop
   prompt_tokens.push_back(cur_token);
 
-  int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate(
-      prompt_tokens, cur_pos_, seq_len, token_callback, dump_logits, nullptr));
+  ET_UNWRAP(
+      num_generated_tokens,
+      token_generator_->generate(
+          prompt_tokens,
+          cur_pos_,
+          seq_len,
+          token_callback,
+          dump_logits,
+          nullptr));
   stats_.inference_end_ms = time_in_ms();
   ET_LOG(
       Info,
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -227,8 +227,8 @@ Error Runner::load() {
 
   ET_LOG(Info, "Reading metadata from model");
   // retrieve any method meta, can be either prefill or kv
-  int64_t num_layers =
-      ET_UNWRAP(module_->get("get_n_layers")).toScalar().to<int64_t>();
+  ET_UNWRAP(num_layers_evalue__, module_->get("get_n_layers"));
+  int64_t num_layers = num_layers_evalue__.toScalar().to<int64_t>();
 
   ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers");
   // k_cache: [1, n_heads, head_dim, seq_len]
@@ -270,7 +270,8 @@ Error Runner::load() {
   // attention
   int32_t sliding_window = context_len_;
   if (module_->method_names()->count("get_sliding_window") > 0) {
-    sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt();
+    ET_UNWRAP(sliding_window_evalue__, module_->get("get_sliding_window"));
+    sliding_window = sliding_window_evalue__.toInt();
   }
   kv_manager_ = std::make_unique<KVManager>(
       KVManager::Metadata{
@@ -461,8 +462,9 @@ Error Runner::generate_from_prompt_or_file(
   // print the first token from prefill. No prev_token so use cur_token for
   // it.
   if (token_callback) {
-    token_callback(
-        ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
+    ET_UNWRAP_TOKENIZER(
+        decoded_token__, tokenizer_->decode(cur_token, cur_token));
+    token_callback(decoded_token__);
   }
   ET_LOG(
       Info,
@@ -471,13 +473,15 @@ Error Runner::generate_from_prompt_or_file(
 
   // start the main loop
   prompt_tokens.push_back(cur_token);
-  int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate(
-      prompt_tokens,
-      cur_pos_,
-      seq_len,
-      token_callback,
-      dump_logits,
-      attention_sink_rope_runner_.get()));
+  ET_UNWRAP(
+      num_generated_tokens,
+      token_generator_->generate(
+          prompt_tokens,
+          cur_pos_,
+          seq_len,
+          token_callback,
+          dump_logits,
+          attention_sink_rope_runner_.get()));
   stats_.inference_end_ms = time_in_ms();
   ET_LOG(
       Info,
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
@@ -337,8 +337,9 @@ Result<int64_t> TokenGenerator::generate(
     pos++;
 
     // print the token as string, decode it with the Tokenizer object
-    token_callback(
-        ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+    ET_UNWRAP_TOKENIZER(
+        decoded_token__, tokenizer_->decode(prev_token, cur_token));
+    token_callback(decoded_token__);
 
     // data-dependent terminating condition: we have n_eos_ number of EOS
     if (eos_ids_->count(cur_token) > 0) {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/utils.h
@@ -11,6 +11,7 @@
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <cstddef>
 #include <memory>
+#include <vector>
 
 // Template struct to hold tensor data and tensor
 
diff --git a/examples/qualcomm/oss_scripts/t5/runner/runner.cpp b/examples/qualcomm/oss_scripts/t5/runner/runner.cpp
@@ -180,8 +180,9 @@ Error Runner::generate(
     output_token_ids.push_back(cur_token);
 
     if (token_callback) {
-      token_callback(
-          ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+      ET_UNWRAP_TOKENIZER(
+          decoded_token__, tokenizer_->decode(prev_token, cur_token));
+      token_callback(decoded_token__);
     }
     if (eos_ids_->count(cur_token) > 0) {
       ET_LOG(Info, "\nReached to the end of generation");
diff --git a/examples/qualcomm/oss_scripts/whisper/runner/runner.cpp b/examples/qualcomm/oss_scripts/whisper/runner/runner.cpp
@@ -171,8 +171,9 @@ Error Runner::transcribe(
     ++pos;
 
     if (token_callback) {
-      token_callback(
-          ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+      ET_UNWRAP_TOKENIZER(
+          decoded_token__, tokenizer_->decode(prev_token, cur_token));
+      token_callback(decoded_token__);
     }
     if (eos_ids_->count(cur_token) > 0) {
       ET_LOG(Info, "\nReached to the end of generation");
diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp
@@ -400,8 +400,7 @@ void KVCachedMemory::prepare_io(
     for (int i = 0, range = 1024 / thread_pool_.num_workers();
          i < thread_pool_.num_workers();
          ++i) {
-      lr_update_kv_.push_back(
-          {.start = i * range, .end = (i + 1) * range, .step = 1});
+      lr_update_kv_.push_back({i * range, (i + 1) * range, 1});
     }
   }
 }
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
@@ -19,18 +19,19 @@
 #include <sys/resource.h>
 #endif
 
-#define ET_UNWRAP_TOKENIZER(result__)                       \
-  ({                                                        \
-    auto tk_result__ = (result__);                          \
-    if (!tk_result__.ok()) {                                \
-      ET_LOG(                                               \
-          Error,                                            \
-          "Tokenizers error code %d",                       \
-          static_cast<uint32_t>(tk_result__.error()));      \
-      return ::executorch::runtime::Error::InvalidArgument; \
-    }                                                       \
-    std::move(*tk_result__);                                \
-  })
+// The internal result variable is named et_unwrap_result_##var__ rather than
+// a fixed name so that multiple ET_UNWRAP_TOKENIZER calls in the same scope
+// do not collide with each other.
+#define ET_UNWRAP_TOKENIZER(var__, result__)                      \
+  auto et_unwrap_result_##var__ = (result__);                     \
+  if (!et_unwrap_result_##var__.ok()) {                           \
+    ET_LOG(                                                       \
+        Error,                                                    \
+        "Tokenizers error code %d",                               \
+        static_cast<uint32_t>(et_unwrap_result_##var__.error())); \
+    return ::executorch::runtime::Error::InvalidArgument;         \
+  }                                                               \
+  auto var__ = std::move(*et_unwrap_result_##var__);
 
 #define ET_CHECK_TK_OK_OR_RETURN_ERROR(result__, ...)                        \
   do {                                                                       \
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -916,7 +916,7 @@ struct promote_types {
 #define ET_INTERNAL_SWITCH(TYPE, CONTEXT, NAME, ...)            \
   [&] {                                                         \
     const auto& _st = TYPE;                                     \
-    constexpr const char* et_switch_name = NAME;                \
+    const char* et_switch_name = NAME;                          \
     (void)et_switch_name; /* Suppress unused var */             \
     C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
     switch (_st) {                                              \
diff --git a/runtime/core/result.h b/runtime/core/result.h

Original file line number	Diff line number	Diff line change
`@@ -70,9 +70,9 @@ class UndefinedQuantizeParamsWrapper final : public QuantizeParamsWrapper {`
`70`	`70`	`}`
`71`	`71`
`72`	`72`	`Qnn_QuantizeParams_t CreateQuantizeParams() override {`
`73`		`- Qnn_QuantizeParams_t rval = {`
`74`		`- .encodingDefinition = GetEncodingDefinition(),`
`75`		`- .quantizationEncoding = GetQuantizationEncoding()};`
	`73`	`+ Qnn_QuantizeParams_t rval;`
	`74`	`+ rval.encodingDefinition = GetEncodingDefinition();`
	`75`	`+ rval.quantizationEncoding = GetQuantizationEncoding();`
`76`	`76`	`return rval;`
`77`	`77`	`}`
`78`	`78`	`};`
Original file line number	Diff line number	Diff line change
`@@ -199,8 +199,8 @@ int32_t main(int32_t argc, char** argv) {`
`199`	`199`	`}`
`200`	`200`	`}`
`201`	`201`	`// generate`
`202`		`- executorch::extension::llm::GenerationConfig config{`
`203`		`- .temperature = temperature};`
	`202`	`+ executorch::extension::llm::GenerationConfig config{};`
	`203`	`+ config.temperature = temperature;`
`204`	`204`
`205`	`205`	`config.ignore_eos = FLAGS_ignore_eos;`
`206`	`206`	`config.num_bos = FLAGS_num_bos;`
Original file line number	Diff line number	Diff line change
`@@ -40,9 +40,9 @@ Error AttentionSinkRopeRunner::load(`
`40`	`40`	`for (const std::string& method_name : method_names) {`
`41`	`41`	`ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name));`
`42`	`42`	`}`
`43`		`- eviction_batch_size_ = ET_UNWRAP(module_->get("get_eviction_batch_size"))`
`44`		`- .toScalar()`
`45`		`- .to<int64_t>();`
	`43`	`+ ET_UNWRAP(`
	`44`	`+ eviction_batch_size_evalue__, module_->get("get_eviction_batch_size"));`
	`45`	`+ eviction_batch_size_ = eviction_batch_size_evalue__.toScalar().to<int64_t>();`
`46`	`46`	`return Error::Ok;`
`47`	`47`	`}`
`48`	`48`