microsoft
diff --git a/‎cmake/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎cmake/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmake/external/onnxruntime_external_deps.cmake‎
Lines changed: 2 additions & 2 deletions b/‎cmake/external/onnxruntime_external_deps.cmake‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cmake/onnxruntime_mlas.cmake‎
Lines changed: 16 additions & 1 deletion b/‎cmake/onnxruntime_mlas.cmake‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 9 additions & 6 deletions b/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎docs/ContribOperators.md‎
Lines changed: 6 additions & 7 deletions b/‎docs/ContribOperators.md‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎docs/OperatorKernels.md‎
Lines changed: 10 additions & 5 deletions b/‎docs/OperatorKernels.md‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h‎
Lines changed: 12 additions & 0 deletions b/‎include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎onnxruntime/contrib_ops/cpu/bert/attention_base.h‎
Lines changed: 30 additions & 17 deletions b/‎onnxruntime/contrib_ops/cpu/bert/attention_base.h‎
Lines changed: 30 additions & 17 deletions
@@ -638,6 +638,7 @@ else()
   check_cxx_compiler_flag(-Wcatch-value HAS_CATCH_VALUE)
   check_cxx_compiler_flag(-Wclass-memaccess HAS_CLASS_MEMACCESS)
   check_cxx_compiler_flag(-Wcharacter-conversion HAS_CHARACTER_CONVERSION)
+  check_cxx_compiler_flag(-Wno-error=character-conversion HAS_NO_ERROR_CHARACTER_CONVERSION)
   check_cxx_compiler_flag(-Wdangling-reference HAS_DANGLING_REFERENCE)
   check_cxx_compiler_flag(-Wdeprecated-anon-enum-enum-conversion HAS_DEPRECATED_ANON_ENUM_ENUM_CONVERSION)
   check_cxx_compiler_flag(-Wdeprecated-builtins HAS_DEPRECATED_BUILTINS)
 
@@ -149,10 +149,10 @@ if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE AND NOT onnxruntime_USE_VCPKG)
       if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
         onnxruntime_fetchcontent_declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64} EXCLUDE_FROM_ALL)
         FetchContent_Populate(protoc_binary)
-      elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
+      elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
         onnxruntime_fetchcontent_declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86} EXCLUDE_FROM_ALL)
         FetchContent_Populate(protoc_binary)
-      elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
+      elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
         onnxruntime_fetchcontent_declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64} EXCLUDE_FROM_ALL)
         FetchContent_Populate(protoc_binary)
       endif()
 
@@ -636,9 +636,13 @@ else()
             enable_language(ASM)
             check_cxx_source_compiles("
               #ifdef _AIX
+              #include <sys/systemcfg.h>
+              #if !defined(POWER_10)
               #define POWER_10       0x40000
+              #endif
+              #if !defined(POWER_10_ANDUP)
               #define POWER_10_ANDUP (POWER_10)
-              #include <sys/systemcfg.h>
+              #endif
               #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
               int main() {
                 bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31);
@@ -911,6 +915,11 @@ endif()
     if(RISCV64 AND MLAS_SOURCE_IS_NOT_SET)
         file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
           "${MLAS_SRC_DIR}/scalar/*.cpp")
+        # Remove scalar depthwise kernel; replaced by the vectorized version
+        list(REMOVE_ITEM mlas_platform_srcs
+          "${MLAS_SRC_DIR}/scalar/SconvDepthwiseKernelScalar.cpp")
+        list(APPEND mlas_platform_srcs
+          ${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp)
 
         if(onnxruntime_USE_RVV)
           set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
@@ -932,11 +941,17 @@ endif()
               ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
               ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
               ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
+              ${MLAS_SRC_DIR}/riscv64/sconv_depthwise_kernel_rvv.cpp
+              ${MLAS_SRC_DIR}/riscv64/sconv_nchwc_kernel_rvv.cpp
             )
+            list(REMOVE_ITEM mlas_platform_srcs
+              "${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp")
             set_source_files_properties(
               ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
               ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
               ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
+              ${MLAS_SRC_DIR}/riscv64/sconv_depthwise_kernel_rvv.cpp
+              ${MLAS_SRC_DIR}/riscv64/sconv_nchwc_kernel_rvv.cpp
               PROPERTIES COMPILE_FLAGS "-march=rv64gcv -mabi=lp64d")
             list(APPEND mlas_private_compile_definitions MLAS_USE_RVV=1)
           else()
 
@@ -50,6 +50,13 @@ function(filter_test_srcs test_srcs_var)
 endfunction()
 
 set(disabled_warnings)
+
+function(onnxruntime_disable_gtest_character_conversion_as_error target_name)
+  if (HAS_NO_ERROR_CHARACTER_CONVERSION)
+    target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
+  endif()
+endfunction()
+
 function(AddTest)
   cmake_parse_arguments(_UT "DYN" "TARGET" "LIBS;SOURCES;DEPENDS;TEST_ARGS" ${ARGN})
   list(REMOVE_DUPLICATES _UT_SOURCES)
@@ -170,9 +177,7 @@ function(AddTest)
     if (${HAS_NOERROR})
       target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=uninitialized>")
     endif()
-    if (${HAS_CHARACTER_CONVERSION})
-      target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
-    endif()
+    onnxruntime_disable_gtest_character_conversion_as_error(${_UT_TARGET})
   endif()
 
   set(TEST_ARGS ${_UT_TEST_ARGS})
@@ -847,9 +852,7 @@ if(MSVC)
                 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd6326>")
 else()
   target_include_directories(onnxruntime_test_utils PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
-  if (HAS_CHARACTER_CONVERSION)
-    target_compile_options(onnxruntime_test_utils PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
-  endif()
+  onnxruntime_disable_gtest_character_conversion_as_error(onnxruntime_test_utils)
 endif()
 if (onnxruntime_USE_NCCL)
   target_include_directories(onnxruntime_test_utils PRIVATE ${NCCL_INCLUDE_DIRS})
 
@@ -3761,7 +3761,6 @@ This version of the operator has been available since version 1 of the 'com.micr
 ### <a name="com.microsoft.NhwcFusedConv"></a><a name="com.microsoft.nhwcfusedconv">**com.microsoft.NhwcFusedConv**</a>
 
   NhwcFusedConv is a Conv operator with optional activation and add operators fused in.
-  Only has fp16 implementation as of 2023/04/15.
 
 #### Version
 
@@ -3792,26 +3791,26 @@ This version of the operator has been available since version 1 of the 'com.micr
 
 <dl>
 <dt><tt>X</tt> : T</dt>
-<dd></dd>
+<dd>Input activation tensor in channels-last layout. For 2D convolution this is [N, H, W, C], where N is batch size, H/W are spatial dimensions, and C is the number of input channels.</dd>
 <dt><tt>W</tt> : T</dt>
-<dd></dd>
+<dd>Convolution weight tensor in the standard ONNX Conv filter layout [M, C/group, kH, kW], where M is the number of output channels.</dd>
 <dt><tt>B</tt> (optional) : T</dt>
-<dd></dd>
+<dd>Optional 1D bias tensor of shape [M].</dd>
 <dt><tt>Z</tt> (optional) : T</dt>
-<dd>Tensor to be added to the output, must be the same shape and format as the output tensor.</dd>
+<dd>Optional residual/add tensor in the same channels-last layout and shape as the output tensor Y. For 2D convolution this is [N, out_H, out_W, M].</dd>
 </dl>
 
 #### Outputs
 
 <dl>
 <dt><tt>Y</tt> : T</dt>
-<dd></dd>
+<dd>Output tensor in channels-last layout. For 2D convolution this is [N, out_H, out_W, M], where M is the number of output channels.</dd>
 </dl>
 
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float16)</dt>
+<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
 <dd>Constrain input and output types to float tensors</dd>
 </dl>
 
 
@@ -810,7 +810,8 @@ The **OpSet Version** column uses the following notation:
 |||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(bool)|
 |LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
+|LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|22+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
+|||[14, 21]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |||[7, 13]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |LayerNormalization|*in* X:**T**<br> *in* Scale:**T**<br> *in* B:**T**<br> *out* Y:**T**<br> *out* Mean:**U**<br> *out* InvStdDev:**U**<br><br>or<br><br>*in* X:**T**<br> *in* Scale:**V**<br> *in* B:**V**<br> *out* Y:**V**<br> *out* Mean:**U**<br> *out* InvStdDev:**U**|17+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **U** = tensor(float)|
 |||[1, 16]|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **U** = tensor(double), tensor(float)<br/> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
@@ -889,10 +890,14 @@ The **OpSet Version** column uses the following notation:
 |RNN|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**|22+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |||[14, 21]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |||[7, 13]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
-|RandomNormal|*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
-|RandomNormalLike|*in* input:**T1**<br> *out* output:**T2**|1+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
-|RandomUniform|*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
-|RandomUniformLike|*in* input:**T1**<br> *out* output:**T2**|1+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
+|RandomNormal|*out* output:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[1, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
+|RandomNormalLike|*in* input:**T1**<br> *out* output:**T2**|22+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[1, 21]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
+|RandomUniform|*out* output:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[1, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
+|RandomUniformLike|*in* input:**T1**<br> *out* output:**T2**|22+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[1, 21]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
 |Range|*in* start:**T**<br> *in* limit:**T**<br> *in* delta:**T**<br> *out* output:**T**|11+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)|
 |Reciprocal|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
 
@@ -124,6 +124,18 @@ static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimiz
 // Default is an empty string which means no optimizers are disabled.
 static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";
 
+// Maximum total output size in bytes that the constant folding optimizer is allowed to produce per node.
+// Prevents malicious models from causing excessive memory allocation during optimization.
+// If the estimated or actual output size of a constant-foldable node exceeds this limit, the node will
+// not be constant folded and will instead be executed at runtime.
+//
+// Option values:
+// - A positive integer (as string): Maximum allowed output size in bytes per constant-folded node.
+//   Default is "1073741824" (1 GB).
+// - "0": Disable the size limit (not recommended for untrusted models).
+static const char* const kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes =
+    "optimization.constant_folding_max_output_size_in_bytes";
+
 // It controls whether to run graph optimizations in loop or not.
 //
 // "0": disable. Graph Optimization Loop is disabled.
 
@@ -6,6 +6,7 @@
 #include <array>
 #include <vector>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
 #ifndef SHARED_PROVIDER
 #include "core/framework/op_kernel.h"
@@ -57,11 +58,11 @@ class AttentionBase {
   AttentionBase(const KernelInfoType& info, bool require_same_hidden_size) {
     int64_t num_heads = 0;
     ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0);
-    num_heads_ = static_cast<int>(num_heads);
+    num_heads_ = narrow<int>(num_heads);
 
     is_unidirectional_ = info.template GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
     do_rotary_ = info.template GetAttrOrDefault<int64_t>("do_rotary", 0) == 1;
-    rotary_embedding_ = static_cast<int>(info.template GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
+    rotary_embedding_ = narrow<int>(info.template GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
     mask_filter_value_ = info.template GetAttrOrDefault<float>("mask_filter_value", -10000.0f);
     scale_ = info.template GetAttrOrDefault<float>("scale", 0.0f);
     if (!info.template GetAttrs<int64_t>("qkv_hidden_sizes", qkv_hidden_sizes_).IsOK()) {
@@ -222,6 +223,14 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
                            "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'");
   }
 
+  // Q, K, V are packed along bias_dims[0]. When their hidden sizes are required to be equal,
+  // bias_dims[0] == 3 * hidden_size must be a multiple of 3.
+  if (require_same_hidden_size_ && bias_dims[0] % 3 != 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Input 'bias' dimension 0 (", bias_dims[0],
+                           ") must be a multiple of 3 (Q, K, V are packed and have equal hidden sizes).");
+  }
+
   int64_t q_hidden_size = bias_dims[0] / static_cast<int64_t>(3);
   int64_t k_hidden_size = q_hidden_size;
   int64_t v_hidden_size = k_hidden_size;
@@ -241,6 +250,10 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
     q_hidden_size = qkv_hidden_sizes_[0];
     k_hidden_size = qkv_hidden_sizes_[1];
     v_hidden_size = qkv_hidden_sizes_[2];
+  } else if (q_hidden_size % num_heads_ != 0) {
+    // Match the error message produced by the qkv_hidden_sizes path above.
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "hidden_size should be divisible by num_heads:", q_hidden_size);
   }
 
   int64_t kv_sequence_length = sequence_length;
@@ -282,14 +295,14 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
                              "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0");
     }
 
-    if (static_cast<int>(past_dims[2]) != num_heads_) {
+    if (past_dims[2] != num_heads_) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                              "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_);
     }
 
-    if (static_cast<int>(past_dims[4]) != k_hidden_size / num_heads_) {
+    if (past_dims[4] != k_hidden_size / num_heads_) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_);
+                             "Inputs 'past' dimension 4 shall have length of ", k_hidden_size / num_heads_);
     }
 
     if (!past_present_share_buffer_) {
@@ -348,17 +361,17 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
 
   if (parameters != nullptr) {
     AttentionParameters* output_parameters = reinterpret_cast<AttentionParameters*>(parameters);
-    output_parameters->batch_size = static_cast<int>(batch_size);
-    output_parameters->sequence_length = static_cast<int>(sequence_length);
-    output_parameters->past_sequence_length = static_cast<int>(past_sequence_length);
-    output_parameters->kv_sequence_length = static_cast<int>(kv_sequence_length);
-    output_parameters->total_sequence_length = static_cast<int>(total_sequence_length);
-    output_parameters->max_sequence_length = static_cast<int>(max_sequence_length);
-    output_parameters->input_hidden_size = static_cast<int>(input_hidden_size);
-    output_parameters->hidden_size = static_cast<int>(q_hidden_size);
-    output_parameters->v_hidden_size = static_cast<int>(v_hidden_size);
-    output_parameters->head_size = static_cast<int>(q_hidden_size) / num_heads_;
-    output_parameters->v_head_size = static_cast<int>(v_hidden_size) / num_heads_;
+    output_parameters->batch_size = narrow<int>(batch_size);
+    output_parameters->sequence_length = narrow<int>(sequence_length);
+    output_parameters->past_sequence_length = narrow<int>(past_sequence_length);
+    output_parameters->kv_sequence_length = narrow<int>(kv_sequence_length);
+    output_parameters->total_sequence_length = narrow<int>(total_sequence_length);
+    output_parameters->max_sequence_length = narrow<int>(max_sequence_length);
+    output_parameters->input_hidden_size = narrow<int>(input_hidden_size);
+    output_parameters->hidden_size = narrow<int>(q_hidden_size);
+    output_parameters->v_hidden_size = narrow<int>(v_hidden_size);
+    output_parameters->head_size = narrow<int>(q_hidden_size) / num_heads_;
+    output_parameters->v_head_size = narrow<int>(v_hidden_size) / num_heads_;
     output_parameters->num_heads = num_heads_;
     output_parameters->is_unidirectional = is_unidirectional_;
     output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr);
@@ -398,7 +411,7 @@ inline Tensor* AttentionBase::GetPresent(TOpKernelContext* context,
                                          int head_size,
                                          int kv_sequence_length,
                                          int& past_sequence_length) const {
-  past_sequence_length = (nullptr != past) ? static_cast<int>(past->Shape().GetDims()[3]) : 0;
+  past_sequence_length = (nullptr != past) ? narrow<int>(past->Shape().GetDims()[3]) : 0;
   std::array<int64_t, 5> present_dims{2, batch_size, num_heads_,
                                       static_cast<int64_t>(kv_sequence_length) + past_sequence_length, head_size};