miaobin
diff --git a/‎include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h‎
Lines changed: 6 additions & 0 deletions b/‎include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc‎
Lines changed: 6 additions & 4 deletions b/‎onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h‎
Lines changed: 5 additions & 1 deletion b/‎onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc‎
Lines changed: 6 additions & 5 deletions b/‎onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h‎
Lines changed: 5 additions & 1 deletion b/‎onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc‎
Lines changed: 9 additions & 9 deletions b/‎onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc‎
Lines changed: 9 additions & 9 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h‎
Lines changed: 5 additions & 0 deletions b/‎onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc‎
Lines changed: 7 additions & 5 deletions b/‎onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h‎
Lines changed: 4 additions & 1 deletion b/‎onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h‎
Lines changed: 4 additions & 1 deletion
@@ -380,6 +380,12 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // - "1": Use LUT based GEMM when available.
 static const char* const kOrtSessionOptionsMlasLutGemm = "mlas.use_lut_gemm";
 
+// Controls whether KleidiAI kernels in MLAS are disabled (by default they are used when available).
+// Option values:
+// - "0": Use KleidiAI kernels when available. [DEFAULT]
+// - "1": Disable KleidiAI kernels even if available.
+static const char* const kOrtSessionOptionsMlasDisableKleidiAi = "mlas.disable_kleidiai";
+
 // When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option.
 // Refer to MatMulNBits op schema for more details.
 // If not provided, default is 4.
 
@@ -19,7 +19,8 @@ template <typename T>
 AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger& logger,
                                       int batch_size, int attn_context_depth, int attn_layer_depth,
                                       int inner_cell_hidden_size, bool has_attn_layer,
-                                      const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool)
+                                      const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool,
+                                      const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config)
     : allocator_(alloc),
       logger_(logger),
       batch_size_(batch_size),
@@ -28,7 +29,8 @@ AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger&
       inner_cell_hidden_size_(inner_cell_hidden_size),
       has_attn_layer_(has_attn_layer),
       attention_mechanism_(attention_mechanism),
-      ttp_(threadpool) {
+      ttp_(threadpool),
+      mlas_backend_kernel_selector_config_(mlas_backend_kernel_selector_config) {
   auto mem_max_steps = attention_mechanism_.GetMaxMemorySteps();
   prev_alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, prev_alignments_ptr_, true);
   alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, alignments_ptr_, true);
@@ -45,7 +47,7 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
                     batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
                     rnn_cell_output.data(), inner_cell_hidden_size_,
                     attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
-                    attn_states_.data(), attn_layer_depth_, ttp_);
+                    attn_states_.data(), attn_layer_depth_, ttp_, mlas_backend_kernel_selector_config_);
   }
 
   // Get the context which is calculated within attention mechanism.
@@ -62,7 +64,7 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
                     batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
                     attn_context_.data(), attn_context_depth_,
                     attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
-                    attn_states_.data(), attn_layer_depth_, ttp_);
+                    attn_states_.data(), attn_layer_depth_, ttp_, mlas_backend_kernel_selector_config_);
   }
 }
 
 
@@ -9,6 +9,7 @@
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"
 #include "core/platform/threadpool.h"
+#include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -23,7 +24,8 @@ class AttentionWrapper {
                    int attn_layer_depth,
                    int inner_cell_hidden_size,
                    bool has_attn_layer,
-                   const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool);
+                   const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool,
+                   const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config);
 
   virtual ~AttentionWrapper() = default;
 
@@ -71,6 +73,8 @@ class AttentionWrapper {
 
   const IAttentionMechanism<T>& attention_mechanism_;
   concurrency::ThreadPool* ttp_;
+
+  const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config_;
 };
 
 }  // namespace contrib
 
@@ -19,8 +19,9 @@ namespace contrib {
 template <typename T>
 BahdanauAttention<T>::BahdanauAttention(AllocatorPtr allocator, const logging::Logger& logger,
                                         int batch_size, int max_memory_step, int memory_depth,
-                                        int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* threadpool)
-    : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), ttp_(threadpool) {
+                                        int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* threadpool,
+                                        const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config)
+    : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), ttp_(threadpool), mlas_backend_kernel_selector_config_(mlas_backend_kernel_selector_config) {
   values_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * memory_depth_, values_ptr_, true);
   keys_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * attn_depth_, keys_ptr_, true);
   processed_query_ = Allocate(allocator_, batch_size_ * attn_depth_, processed_query_ptr_, true);
@@ -80,7 +81,7 @@ void BahdanauAttention<T>::PrepareMemory(
                   batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
                   memory.data(), memory_depth_,
                   memory_layer_weights_.data(), attn_depth_, T{0.0},
-                  keys_.data(), attn_depth_, ttp_);
+                  keys_.data(), attn_depth_, ttp_, mlas_backend_kernel_selector_config_);
 }
 
 template <typename T>
@@ -123,7 +124,7 @@ void BahdanauAttention<T>::Compute(
                   batch_size_, attn_depth_, query_depth_, T{1.0},
                   queries.data(), query_depth_,
                   query_layer_weights_.data(), attn_depth_, T{0.0},
-                  processed_query_.data(), attn_depth_, ttp_);
+                  processed_query_.data(), attn_depth_, ttp_, mlas_backend_kernel_selector_config_);
 
   std::fill(aligns.begin(), aligns.end(), T{});
 
@@ -154,7 +155,7 @@ void BahdanauAttention<T>::Compute(
                     1, memory_depth_, max_memory_steps_, T{1.0},
                     alignments, max_memory_steps_,
                     values.data(), memory_depth_, T{0.0},
-                    outspan.data(), memory_depth_, ttp_);
+                    outspan.data(), memory_depth_, ttp_, mlas_backend_kernel_selector_config_);
   }
 }
 
 
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "core/framework/allocator.h"
+#include "core/mlas/inc/mlas.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 
 #include "attention_mechanism.h"
@@ -23,7 +24,8 @@ class BahdanauAttention : public IAttentionMechanism<T> {
       int memory_depth,
       int query_depth,
       int attn_depth,
-      bool normalize, concurrency::ThreadPool* threadpool);
+      bool normalize, concurrency::ThreadPool* threadpool,
+      const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config);
 
   void SetWeights(
       const gsl::span<const T>& attn_weights,
@@ -78,6 +80,8 @@ class BahdanauAttention : public IAttentionMechanism<T> {
 
   bool normalize_;
   concurrency::ThreadPool* ttp_;
+
+  const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config_;
 };
 
 }  // namespace contrib
 
@@ -248,7 +248,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false, thread_pool);
+        false, thread_pool, &mlas_backend_kernel_selector_config_);
 
     fam.SetWeights(
         FirstHalfSpan(am_v_weights.DataAsSpan<T>()),
@@ -264,7 +264,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        fam, thread_pool);
+        fam, thread_pool, &mlas_backend_kernel_selector_config_);
     faw.SetWeights(FirstHalfSpan(attn_layer_weights_span));
 
     UniDirectionalAttnLstm<T> fw(
@@ -275,7 +275,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[0],
         activation_funcs_.Entries()[1],
         activation_funcs_.Entries()[2],
-        clip_, thread_pool);
+        clip_, thread_pool, &mlas_backend_kernel_selector_config_);
 
     BahdanauAttention<T> bam(
         alloc,
@@ -285,7 +285,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false, thread_pool);
+        false, thread_pool, &mlas_backend_kernel_selector_config_);
     bam.SetWeights(
         SecondHalfSpan(am_v_weights.DataAsSpan<T>()),
         SecondHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -300,7 +300,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        bam, thread_pool);
+        bam, thread_pool, &mlas_backend_kernel_selector_config_);
     baw.SetWeights(SecondHalfSpan(attn_layer_weights_span));
 
     UniDirectionalAttnLstm<T> bw(
@@ -311,7 +311,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[3],
         activation_funcs_.Entries()[4],
         activation_funcs_.Entries()[5],
-        clip_, thread_pool);
+        clip_, thread_pool, &mlas_backend_kernel_selector_config_);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
     bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2);
@@ -325,7 +325,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false, thread_pool);
+        false, thread_pool, &mlas_backend_kernel_selector_config_);
 
     fam.SetWeights(
         am_v_weights.DataAsSpan<T>(),
@@ -341,7 +341,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        fam, thread_pool);
+        fam, thread_pool, &mlas_backend_kernel_selector_config_);
 
     faw.SetWeights(attn_layer_weights_span);
 
@@ -353,7 +353,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[0],
         activation_funcs_.Entries()[1],
         activation_funcs_.Entries()[2],
-        clip_, thread_pool);
+        clip_, thread_pool, &mlas_backend_kernel_selector_config_);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
   }
 
@@ -10,6 +10,7 @@
 #include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
+#include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -58,6 +59,8 @@ class DeepCpuAttnLstmOp final : public OpKernel {
     activation_funcs_ = ActivationFuncs(activation_func_names,
                                         activation_func_alphas,
                                         activation_func_betas);
+
+    SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions());
   }
 
   Status Compute(OpKernelContext* context) const override;
@@ -92,6 +95,8 @@ class DeepCpuAttnLstmOp final : public OpKernel {
   bool input_forget_ = false;
 
   ActivationFuncs activation_funcs_;
+
+  MLAS_BACKEND_KERNEL_SELECTOR_CONFIG mlas_backend_kernel_selector_config_;
 };
 
 }  // namespace contrib
 
@@ -51,7 +51,8 @@ UniDirectionalAttnLstm<T>::UniDirectionalAttnLstm(AllocatorPtr allocator,
                                                   const ActivationFuncs::Entry& activation_func_g,
                                                   const ActivationFuncs::Entry& activation_func_h,
                                                   const float clip,
-                                                  onnxruntime::concurrency::ThreadPool* ttp)
+                                                  onnxruntime::concurrency::ThreadPool* ttp,
+                                                  const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config)
     : allocator_(allocator),
       logger_(logger),
       seq_length_(seq_length),
@@ -64,7 +65,8 @@ UniDirectionalAttnLstm<T>::UniDirectionalAttnLstm(AllocatorPtr allocator,
       use_bias_(!bias.empty()),
       use_peepholes_(!peephole_weights.empty()),
       attention_wrapper_(attention_wrapper),
-      ttp_(ttp) {
+      ttp_(ttp),
+      mlas_backend_kernel_selector_config_(mlas_backend_kernel_selector_config) {
   activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name),
                    activation_func_f.alpha,
                    activation_func_f.beta};
@@ -260,7 +262,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
               input_weights.begin(), input_weights.end(),  // W[iofc]^T
               input_size_ + attention_size_, T{0.0},
               output_iofc_.begin(), output_iofc_.end(),
-              hidden_size_x4, ttp_);
+              hidden_size_x4, ttp_, mlas_backend_kernel_selector_config_);
 
   DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4);
 
@@ -298,7 +300,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                   input_weights.begin() + input_size_, input_weights.end(),  // WA[iofc]
                   input_size_ + attention_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                  hidden_size_x4, ttp_);
+                  hidden_size_x4, ttp_, mlas_backend_kernel_selector_config_);
 
       // calculate Xt*(W[iofc]^T) + Ht-1*R[iofc]
       ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
@@ -307,7 +309,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                   recurrent_weights.begin(), recurrent_weights.end(),  // R[iofc]
                   hidden_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                  hidden_size_x4, ttp_);
+                  hidden_size_x4, ttp_, mlas_backend_kernel_selector_config_);
 
       span_T_iter batched_output, batched_output_end;
       if (output_sequence) {
 
@@ -51,7 +51,8 @@ class UniDirectionalAttnLstm {
                          const ActivationFuncs::Entry& activation_func_g,
                          const ActivationFuncs::Entry& activation_func_h,
                          const float clip,
-                         onnxruntime::concurrency::ThreadPool* ttp);
+                         onnxruntime::concurrency::ThreadPool* ttp,
+                         const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config);
 
   void Compute(const gsl::span<const T>& inputs,
                const gsl::span<const int>& sequence_lengths,
@@ -152,6 +153,8 @@ class UniDirectionalAttnLstm {
   AttentionWrapper<T>& attention_wrapper_;
 
   onnxruntime::concurrency::ThreadPool* ttp_;
+
+  const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* mlas_backend_kernel_selector_config_;
 };
 
 }  // namespace detail