Skip to content

Commit ffbc5e8

Browse files
tianleiwu and Copilot authored
Handle int overflow in rnn (#28003)
### Description Fixes two overflow/underflow bugs in the CPU RNN kernel (`rnn.cc`): - **`SafeInt` for GEMM M-dimension**: `seq_length * batch_size` was computed as a raw `int64_t` multiply before `narrow<int>()`, meaning an overflow would be UB before the check could fire. Replaced with `SafeInt<int64_t>(seq_length) * batch_size` for a checked multiply. - **`seq_length == 0` guard in `Assign_Y_h`**: For the forward direction, `last_time_step = seq_length - 1` underflows to `-1` when `seq_length == 0`, producing a negative `y_offset` and out-of-bounds read. Added an early-exit that zero-fills Y_h for the direction and returns. Also handles `sequence_lens[batch] == 0` (same underflow path), zeroing the affected batch slot and skipping via `continue`. ### Motivation and Context Silent UB from integer overflow/underflow in shape-derived index arithmetic can corrupt memory or produce incorrect results without any diagnostic signal. These cases are legal per the ONNX spec (empty sequences, per-batch zero-length sequences) and must be handled explicitly. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 50129c9 commit ffbc5e8

4 files changed

Lines changed: 224 additions & 20 deletions

File tree

onnxruntime/core/common/safeint.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,48 @@ class SafeIntExceptionHandler<onnxruntime::OnnxRuntimeException> {
3636
#if defined(__GNUC__)
3737
#pragma GCC diagnostic pop
3838
#endif
39+
40+
#include <type_traits>
41+
42+
namespace onnxruntime {
43+
44+
template <typename T>
45+
using remove_cvref_t = std::remove_cv_t<std::remove_reference_t<T>>;
46+
47+
template <typename T>
48+
inline constexpr bool is_supported_integer_v =
49+
std::is_integral_v<remove_cvref_t<T>> && !std::is_same_v<remove_cvref_t<T>, bool>;
50+
51+
//------------------------------------------------------------------------------
52+
// Safe multiplication of two or more integer values into an explicit result type R.
53+
// Throws OnnxRuntimeException on overflow.
54+
//------------------------------------------------------------------------------
55+
template <typename R, typename T, typename U, typename... Rest>
56+
[[nodiscard]] R SafeMul(T a, U b, Rest... rest) {
57+
static_assert(is_supported_integer_v<R>,
58+
"SafeMul requires an integral result type (excluding bool)");
59+
static_assert(is_supported_integer_v<T> && is_supported_integer_v<U>,
60+
"SafeMul requires integral operand types (excluding bool)");
61+
static_assert((is_supported_integer_v<Rest> && ...),
62+
"SafeMul requires integral operand types (excluding bool)");
63+
64+
// SafeMultiply(T, U, T&) requires the first argument and result to share
65+
// the same type. Cast the first operand to R so the result is directly in R.
66+
R cast_a{};
67+
if (!SafeCast(a, cast_a)) {
68+
SafeIntDefaultExceptionHandler::SafeIntOnOverflow();
69+
}
70+
71+
R result{};
72+
if (!SafeMultiply(cast_a, b, result)) {
73+
SafeIntDefaultExceptionHandler::SafeIntOnOverflow();
74+
}
75+
76+
if constexpr (sizeof...(rest) > 0) {
77+
return SafeMul<R>(result, rest...);
78+
} else {
79+
return result;
80+
}
81+
}
82+
83+
} // namespace onnxruntime

onnxruntime/core/providers/cpu/rnn/rnn.cc

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "core/providers/cpu/rnn/rnn.h"
55

6+
#include "core/common/narrow.h"
67
#include "core/common/safeint.h"
78
#include "core/framework/op_kernel_context_internal.h"
89
#include "core/providers/cpu/rnn/rnn_activation_functors.h"
@@ -84,15 +85,32 @@ void ApplyActivationToBatches(const Tensor* sequence_lens, const T* h_prev, T* Y
8485
template <typename T>
8586
void Assign_Y_h(const T* Y_buffer_data, Tensor* Y_h, const Tensor* sequence_lens,
8687
int64_t num_directions, int direction, bool isReverse, int64_t batch_size, int64_t seq_length, int64_t hidden_size) {
88+
if (seq_length == 0) {
89+
// No sequence data was processed; zero out Y_h for this direction.
90+
const size_t y_h_direction_size = SafeMul<size_t>(batch_size, hidden_size);
91+
const size_t Y_h_direction_offset = SafeMul<size_t>(direction, y_h_direction_size);
92+
math::Set<T, CPUMathUtil>(y_h_direction_size, T{0},
93+
Y_h->MutableData<T>() + Y_h_direction_offset, &CPUMathUtil::Instance());
94+
return;
95+
}
96+
8797
for (int batch = 0; batch < batch_size; batch++) {
8898
int64_t last_time_step = isReverse ? 0 : seq_length - 1;
89-
if (nullptr != sequence_lens && !isReverse)
99+
if (nullptr != sequence_lens && !isReverse) {
90100
last_time_step = sequence_lens->Data<int>()[batch] - 1;
101+
if (last_time_step < 0) {
102+
// sequence_lens[batch] == 0: no data was processed for this batch; zero out Y_h.
103+
int64_t Y_h_offset = direction * batch_size * hidden_size + batch * hidden_size;
104+
math::Set<T, CPUMathUtil>(narrow<size_t>(hidden_size), T{0},
105+
Y_h->MutableData<T>() + Y_h_offset, &CPUMathUtil::Instance());
106+
continue;
107+
}
108+
}
91109
int64_t y_offset = last_time_step * num_directions * batch_size * hidden_size +
92110
direction * batch_size * hidden_size +
93111
batch * hidden_size;
94112
int64_t Y_h_offset = direction * batch_size * hidden_size + batch * hidden_size;
95-
math::CopyVector<T, CPUMathUtil>(static_cast<int>(hidden_size), Y_buffer_data + y_offset,
113+
math::CopyVector<T, CPUMathUtil>(narrow<int>(hidden_size), Y_buffer_data + y_offset,
96114
Y_h->MutableData<T>() + Y_h_offset,
97115
&CPUMathUtil::Instance());
98116
}
@@ -109,7 +127,7 @@ void ClearMissingFrames(T* Y_buffer_data, const Tensor* sequence_lens,
109127
seq * num_directions * batch_size * hidden_size +
110128
direction * batch_size * hidden_size +
111129
batch * hidden_size;
112-
math::Set<T, CPUMathUtil>(onnxruntime::narrow<size_t>(hidden_size), 0, Y_buffer_data + offset, &CPUMathUtil::Instance());
130+
math::Set<T, CPUMathUtil>(narrow<size_t>(hidden_size), 0, Y_buffer_data + offset, &CPUMathUtil::Instance());
113131
}
114132
}
115133
}
@@ -155,7 +173,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
155173
ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc));
156174

157175
// X * W^t, each direction has shape of [seq_length, batch_size, hidden_size]
158-
auto x_matmul_data = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * seq_length * batch_size * hidden_size_);
176+
auto x_matmul_data = alloc->Alloc(SafeMul<size_t>(sizeof(float), seq_length, batch_size, hidden_size_));
159177
BufferUniquePtr x_matmul_buffer(x_matmul_data, BufferDeleter(alloc));
160178
auto* x_matmul_w_buffer_data = static_cast<float*>(x_matmul_buffer.get());
161179

@@ -165,7 +183,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
165183
if (Y != nullptr)
166184
Y_buffer_data = Y->MutableData<float>();
167185
else {
168-
Y_data = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * seq_length * num_directions * batch_size * hidden_size_);
186+
Y_data = alloc->Alloc(SafeMul<size_t>(sizeof(float), seq_length, num_directions, batch_size, hidden_size_));
169187
Y_matmul_buffer = BufferUniquePtr(Y_data, BufferDeleter(alloc));
170188
Y_buffer_data = static_cast<float*>(Y_matmul_buffer.get());
171189
}
@@ -177,20 +195,20 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
177195
bool isReverse = direction_ == "reverse" || direction == 1;
178196

179197
if (B != nullptr) {
180-
EigenMatrixMapRowMajor<float>(x_matmul_w_buffer_data, seq_length * SafeInt<size_t>(batch_size), onnxruntime::narrow<size_t>(hidden_size_)).rowwise() =
181-
ConstEigenVectorMap<float>(B->Data<float>() + direction * 2 * hidden_size_, onnxruntime::narrow<size_t>(hidden_size_)).transpose() +
182-
ConstEigenVectorMap<float>(B->Data<float>() + direction * 2 * hidden_size_ + hidden_size_, onnxruntime::narrow<size_t>(hidden_size_)).transpose();
198+
EigenMatrixMapRowMajor<float>(x_matmul_w_buffer_data, SafeMul<size_t>(seq_length, batch_size), narrow<size_t>(hidden_size_)).rowwise() =
199+
ConstEigenVectorMap<float>(B->Data<float>() + direction * 2 * hidden_size_, narrow<size_t>(hidden_size_)).transpose() +
200+
ConstEigenVectorMap<float>(B->Data<float>() + direction * 2 * hidden_size_ + hidden_size_, narrow<size_t>(hidden_size_)).transpose();
183201
} else {
184-
math::Set<float, CPUMathUtil>(seq_length * batch_size * SafeInt<size_t>(hidden_size_), 0, x_matmul_w_buffer_data, &CPUMathUtil::Instance());
202+
math::Set<float, CPUMathUtil>(SafeMul<size_t>(seq_length, batch_size, hidden_size_), 0, x_matmul_w_buffer_data, &CPUMathUtil::Instance());
185203
}
186204

187205
// X * W[direction]^t + B
188206
math::Gemm<float>(
189207
CblasNoTrans,
190208
CblasTrans,
191-
static_cast<int>(seq_length * batch_size),
192-
static_cast<int>(hidden_size_),
193-
static_cast<int>(input_size),
209+
SafeMul<int>(seq_length, batch_size),
210+
narrow<int>(hidden_size_),
211+
narrow<int>(input_size),
194212
1,
195213
X.Data<float>(),
196214
W.Data<float>() + direction * hidden_size_ * input_size,
@@ -202,7 +220,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
202220
int64_t time_step = isReverse ? (seq_length - t - 1) : t;
203221
int64_t Y_frame_offset = (time_step * num_directions + direction) * Y_frame_size;
204222
float* Y_buffer_data_current_frame = Y_buffer_data + Y_frame_offset;
205-
auto y_frame_mat = EigenMatrixMapRowMajor<float>(Y_buffer_data_current_frame, onnxruntime::narrow<size_t>(batch_size), onnxruntime::narrow<size_t>(hidden_size_));
223+
auto y_frame_mat = EigenMatrixMapRowMajor<float>(Y_buffer_data_current_frame, narrow<size_t>(batch_size), narrow<size_t>(hidden_size_));
206224

207225
const float* h_prev = nullptr;
208226
if (t == 0) {
@@ -224,21 +242,21 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
224242
math::Gemm<float>(
225243
CblasNoTrans,
226244
CblasTrans,
227-
static_cast<int>(batch_size),
228-
static_cast<int>(hidden_size_),
229-
static_cast<int>(hidden_size_),
245+
narrow<int>(batch_size),
246+
narrow<int>(hidden_size_),
247+
narrow<int>(hidden_size_),
230248
1,
231249
h_prev,
232250
R.Data<float>() + direction * hidden_size_ * hidden_size_,
233251
0,
234252
Y_buffer_data_current_frame,
235253
tp, &mlas_backend_kernel_selector_config_);
236254
} else {
237-
math::Set<float, CPUMathUtil>(batch_size * SafeInt<size_t>(hidden_size_), 0, Y_buffer_data_current_frame, &CPUMathUtil::Instance());
255+
math::Set<float, CPUMathUtil>(SafeMul<size_t>(batch_size, hidden_size_), 0, Y_buffer_data_current_frame, &CPUMathUtil::Instance());
238256
}
239257

240258
// X[time_step] * W^t + H_t_1 * R^t
241-
y_frame_mat += EigenMatrixMapRowMajor<float>(&x_matmul_w_buffer_data[time_step * Y_frame_size], onnxruntime::narrow<size_t>(batch_size), onnxruntime::narrow<size_t>(hidden_size_));
259+
y_frame_mat += EigenMatrixMapRowMajor<float>(&x_matmul_w_buffer_data[time_step * Y_frame_size], narrow<size_t>(batch_size), narrow<size_t>(hidden_size_));
242260

243261
// apply activation
244262
ApplyActivationToBatches<float>(sequence_lens, h_prev, Y_buffer_data_current_frame,
@@ -258,10 +276,10 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
258276
}
259277

260278
if (Y != nullptr)
261-
DumpMatrix("Y", Y_buffer_data, (int)(seq_length * num_directions * batch_size), (int)hidden_size_);
279+
DumpMatrix("Y", Y_buffer_data, SafeMul<int>(seq_length, num_directions, batch_size), narrow<int>(hidden_size_));
262280

263281
if (Y_h != nullptr)
264-
DumpMatrix("Y_h", Y_h->Data<float>(), (int)(num_directions * batch_size), (int)hidden_size_);
282+
DumpMatrix("Y_h", Y_h->Data<float>(), SafeMul<int>(num_directions, batch_size), narrow<int>(hidden_size_));
265283

266284
return Status::OK();
267285
}
onnxruntime/test/common/safeint_test.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "core/common/safeint.h"
5+
6+
#include <cstddef>
7+
#include <cstdint>
8+
#include <limits>
9+
10+
#include "gtest/gtest.h"
11+
12+
namespace onnxruntime::test {
13+
14+
static_assert(is_supported_integer_v<int>);
15+
static_assert(is_supported_integer_v<uint8_t>);
16+
static_assert(!is_supported_integer_v<bool>);
17+
18+
TEST(SafeIntTest, SafeMulMultipliesOperands) {
19+
EXPECT_EQ(SafeMul<size_t>(size_t{2}, 3U), size_t{6});
20+
EXPECT_EQ(SafeMul<int>(-2, 3, 4), -24);
21+
}
22+
23+
TEST(SafeIntTest, SafeMulHandlesSameVariableOperands) {
24+
const int value = 7;
25+
EXPECT_EQ(SafeMul<int>(value, value), 49);
26+
}
27+
28+
#ifndef ORT_NO_EXCEPTIONS
29+
TEST(SafeIntTest, SafeMulThrowsOnInitialCastOverflow) {
30+
EXPECT_THROW((void)SafeMul<uint32_t>(-1, 2), OnnxRuntimeException);
31+
}
32+
33+
TEST(SafeIntTest, SafeMulThrowsOnMultiplyOverflow) {
34+
EXPECT_THROW((void)SafeMul<int>(std::numeric_limits<int>::max(), 2), OnnxRuntimeException);
35+
}
36+
#endif
37+
38+
} // namespace onnxruntime::test

onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33

4+
#include <cmath>
5+
46
#include "core/providers/cpu/rnn/rnn.h"
57
#include "gtest/gtest.h"
68
#include "test/providers/provider_test_utils.h"
@@ -883,5 +885,106 @@ TEST(RNNTest, RNN_with_invalid_activation_load_failure) {
883885
{kCudaExecutionProvider, kTensorrtExecutionProvider});
884886
}
885887

888+
// Test that seq_length == 0 produces zero-filled Y and Y_h without crashing.
889+
TEST(RNNTest, RNN_seq_length_zero) {
890+
auto cpu = DefaultCpuExecutionProvider();
891+
if (!cpu) GTEST_SKIP() << "CPU EP not available in this build.";
892+
893+
OpTester test("RNN");
894+
int64_t num_directions = 1, input_size = 2, hidden_size = 3, batch_size = 2, seq_length = 0;
895+
896+
test.AddAttribute("activations", vector<string>(num_directions, "Tanh"));
897+
test.AddAttribute("direction", "forward");
898+
test.AddAttribute("hidden_size", hidden_size);
899+
900+
std::vector<int64_t> X_dims = {seq_length, batch_size, input_size};
901+
std::vector<float> X_data{};
902+
test.AddInput<float>("X", X_dims, X_data);
903+
904+
std::vector<int64_t> W_dims = {num_directions, hidden_size, input_size};
905+
std::vector<float> W_data({-0.1f, 0.2f, 1.f, -2.f, -1.f, 3.f});
906+
test.AddInput<float>("W", W_dims, W_data);
907+
908+
std::vector<int64_t> R_dims = {num_directions, hidden_size, hidden_size};
909+
std::vector<float> R_data(hidden_size * hidden_size, 0.f);
910+
test.AddInput<float>("R", R_dims, R_data);
911+
912+
// Y: shape [0, 1, 2, 3] -> empty
913+
std::vector<int64_t> Y_dims = {seq_length, num_directions, batch_size, hidden_size};
914+
std::vector<float> Y_data{};
915+
test.AddOutput<float>("Y", Y_dims, Y_data);
916+
917+
// Y_h: shape [1, 2, 3] -> all zeros
918+
std::vector<int64_t> Y_h_dims{num_directions, batch_size, hidden_size};
919+
std::vector<float> Y_h_data(num_directions * batch_size * hidden_size, 0.f);
920+
test.AddOutput<float>("Y_h", Y_h_dims, Y_h_data);
921+
test.ConfigEp(std::move(cpu)).RunWithConfig();
922+
}
923+
924+
// Test that per-batch sequence_lens containing 0 produces zero-filled Y_h for those batches.
925+
TEST(RNNTest, RNN_forward_sequence_lens_with_zero) {
926+
auto cpu = DefaultCpuExecutionProvider();
927+
if (!cpu) GTEST_SKIP() << "CPU EP not available in this build.";
928+
929+
OpTester test("RNN");
930+
int64_t num_directions = 1, input_size = 2, hidden_size = 3, batch_size = 2, seq_length = 2;
931+
932+
test.AddAttribute("activations", vector<string>(num_directions, "Tanh"));
933+
test.AddAttribute("direction", "forward");
934+
test.AddAttribute("hidden_size", hidden_size);
935+
936+
// X shape: [seq_length=2, batch_size=2, input_size=2]
937+
std::vector<int64_t> X_dims = {seq_length, batch_size, input_size};
938+
std::vector<float> X_data({0.1f, 0.2f,
939+
0.3f, 0.4f,
940+
0.5f, 0.6f,
941+
0.7f, 0.8f});
942+
test.AddInput<float>("X", X_dims, X_data);
943+
944+
std::vector<int64_t> W_dims = {num_directions, hidden_size, input_size};
945+
std::vector<float> W_data({-0.1f, 0.2f, 1.f, -2.f, -1.f, 3.f});
946+
test.AddInput<float>("W", W_dims, W_data);
947+
948+
std::vector<int64_t> R_dims = {num_directions, hidden_size, hidden_size};
949+
std::vector<float> R_data(hidden_size * hidden_size, 0.f);
950+
test.AddInput<float>("R", R_dims, R_data);
951+
952+
std::vector<int64_t> B_dims = {num_directions, 2 * hidden_size};
953+
std::vector<float> B_data(2 * hidden_size, 0.f);
954+
test.AddInput<float>("B", B_dims, B_data);
955+
956+
// batch 0 has sequence_lens=2, batch 1 has sequence_lens=0
957+
std::vector<int64_t> sequence_lens_dims{batch_size};
958+
std::vector<int> sequence_lens_data{2, 0};
959+
test.AddInput<int>("sequence_lens", sequence_lens_dims, sequence_lens_data);
960+
961+
std::vector<int64_t> initial_h_dims = {num_directions, batch_size, hidden_size};
962+
std::vector<float> initial_h_data(num_directions * batch_size * hidden_size, 0.f);
963+
test.AddInput<float>("initial_h", initial_h_dims, initial_h_data);
964+
965+
// Y output is optional; skip it to keep test simple.
966+
test.AddOptionalOutputEdge<float>();
967+
968+
// Y_h: shape [1, 2, 3]
969+
// batch 0 gets the result of forward pass at last time step (seq_length-1=1).
970+
// batch 1 has sequence_lens=0 so Y_h should be zero.
971+
//
972+
// For batch 0:
973+
// time_step 0: X=[0.1, 0.2], Y = tanh(X * W^T) = tanh([-0.1*0.1+0.2*0.2, 1*0.1-2*0.2, -1*0.1+3*0.2])
974+
// = tanh([0.03, -0.3, 0.5])
975+
// time_step 1: X=[0.5, 0.6], Y = tanh(X * W^T + H_prev * R^T)
976+
// R is zero, so Y = tanh([-0.1*0.5+0.2*0.6, 1*0.5-2*0.6, -1*0.5+3*0.6])
977+
// = tanh([0.07, -0.7, 1.3])
978+
float y_h_batch0_f0 = std::tanh(0.07f);
979+
float y_h_batch0_f1 = std::tanh(-0.7f);
980+
float y_h_batch0_f2 = std::tanh(1.3f);
981+
982+
std::vector<int64_t> Y_h_dims{num_directions, batch_size, hidden_size};
983+
std::vector<float> Y_h_data{y_h_batch0_f0, y_h_batch0_f1, y_h_batch0_f2,
984+
0.f, 0.f, 0.f};
985+
test.AddOutput<float>("Y_h", Y_h_dims, Y_h_data);
986+
test.ConfigEp(std::move(cpu)).RunWithConfig();
987+
}
988+
886989
} // namespace test
887990
} // namespace onnxruntime

0 commit comments

Comments
 (0)