Introduce checks to prevent buffer overflow, add tests (#28713)

yuslepukhin · web-flow · commit df44eabdc0b2 · 2026-05-29T14:36:16.000-07:00
This pull request improves the correctness and robustness of the
`InPlaceAccumulator` and `InPlaceAccumulatorV2` gradient accumulation
kernels by adding new unit tests and enforcing stricter shape validation
in both the CPU and CUDA implementations of `InPlaceAccumulatorV2`. The
kernels now reject mismatched shapes before any data is copied or
accumulated, and the new tests confirm that an omitted optional output
is handled by both the CPU and CUDA execution providers.

**Test coverage improvements:**

* Added a test to verify that, when the optional `update_signal` is
`false`, `InPlaceAccumulator` passes `old_sum` through unchanged as
`new_sum` and does not add the `value` input to it.
* Added tests for `InPlaceAccumulatorV2` to check that shape mismatches
between `accumulation_buffer` and `value` are detected and handled as
errors, covering both overwrite and accumulate branches.
* Added tests to verify that `InPlaceAccumulatorV2` correctly handles
the case where the optional `accumulation_buffer_out` output is omitted,
for both CPU and CUDA providers.
[[1]](diffhunk://#diff-c62fec6d9ac7d24c7d6befe4e18317d2690385051c01a6412651f01e190de1beR2247-R2288)
[[2]](diffhunk://#diff-c62fec6d9ac7d24c7d6befe4e18317d2690385051c01a6412651f01e190de1beR2332-R2346)

**Kernel validation improvements:**

* Added explicit shape validation to the CPU implementation of
`InPlaceAccumulatorV2`, ensuring that the shapes of the accumulation
buffer and the value tensor match.
* Added the same shape validation to the CUDA implementation of
`InPlaceAccumulatorV2`, preventing out-of-bounds memory accesses.
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -2175,6 +2175,20 @@ TEST(GradientUtilsTest, InPlaceAccumulatorFloat32) {
   test.Run();
 }
 
+// With the optional update_signal input set to false, new_sum must equal old_sum ({1, 2, 3});
+// the value input ({4, 5, 6}) must not be added to the result.
+TEST(GradientUtilsTest, InPlaceAccumulatorFloat32_NoUpdate) {
+  OpTester test("InPlaceAccumulator", 1, onnxruntime::kMSDomain);
+
+  test.AddInput<float>("old_sum", {3}, {1.f, 2.f, 3.f});
+  test.AddInput<float>("value", {3}, {4.f, 5.f, 6.f});
+  test.AddInput<bool>("update_signal", {1}, {false});  // the case under test: no update requested
+
+  test.AddOutput<float>("new_sum", {3}, {1.f, 2.f, 3.f});  // same values as old_sum
+
+  test.Run();
+}
+
 void TestInPlaceAccumulatorV2(
     const std::vector<int64_t>& tensor_dim,
     const std::unordered_set<std::string>& excluded_providers,
@@ -2230,6 +2244,59 @@ TEST(GradientUtilsTest, InPlaceAccumulatorV2Overwrite) {
   test.Run();
 }
 
+// Feeds a 3-element old_sum and a 5-element value; Run() must fail with "accumulation_buffer shape"
+// rather than perform an out-of-bounds copy. overwrite_flag selects the overwrite or accumulate branch.
+static void RunInPlaceAccumulatorV2ShapeMismatch(bool overwrite_flag,
+                                                 std::unique_ptr<IExecutionProvider> provider) {
+  OpTester test("InPlaceAccumulatorV2", 1, onnxruntime::kMSDomain);
+
+  test.AddInput<float>("old_sum", {3}, {1.f, 2.f, 3.f});
+  // value has more elements than old_sum; without validation the kernel would over-read/over-write.
+  test.AddInput<float>("value", {5}, {4.f, 5.f, 6.f, 7.f, 8.f});
+  test.AddInput<bool>("overwrite", {1}, {overwrite_flag});
+  test.AddOutput<bool>("updated", {1}, {true});
+  test.AddOutput<float>("new_sum", {3}, {0.f, 0.f, 0.f});  // placeholder; presumably not compared on failure
+
+  std::vector<std::unique_ptr<IExecutionProvider>> providers;
+  providers.emplace_back(std::move(provider));
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "accumulation_buffer shape", {}, nullptr, &providers);
+}
+
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_ShapeMismatch_Overwrite) {  // CPU provider, overwrite=true
+  RunInPlaceAccumulatorV2ShapeMismatch(/*overwrite_flag=*/true, DefaultCpuExecutionProvider());
+}
+
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_ShapeMismatch_Accumulate) {  // CPU provider, overwrite=false
+  RunInPlaceAccumulatorV2ShapeMismatch(/*overwrite_flag=*/false, DefaultCpuExecutionProvider());
+}
+
+#if defined(USE_CUDA)  // same two shape-mismatch cases, run on the CUDA provider
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_ShapeMismatch_Overwrite_GPU) {
+  RunInPlaceAccumulatorV2ShapeMismatch(/*overwrite_flag=*/true, DefaultCudaExecutionProvider());
+}
+
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_ShapeMismatch_Accumulate_GPU) {
+  RunInPlaceAccumulatorV2ShapeMismatch(/*overwrite_flag=*/false, DefaultCudaExecutionProvider());
+}
+#endif  // defined(USE_CUDA)
+
+// Omits the optional accumulation_buffer_out output; the kernel must still run and report updated=true.
+// NOTE(review): only the "updated" flag is asserted here; the in-place buffer contents are not checked.
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_NoAccumulationOutput_CPU) {
+  OpTester test("InPlaceAccumulatorV2", 1, onnxruntime::kMSDomain);
+
+  test.AddInput<float>("old_sum", {3}, {1.f, 2.f, 3.f});
+  test.AddInput<float>("value", {3}, {4.f, 5.f, 6.f});
+  test.AddInput<bool>("overwrite", {1}, {false});
+  test.AddOutput<bool>("updated", {1}, {true});
+  test.AddOptionalOutputEdge<float>();  // stands in for the omitted optional second output
+
+  std::vector<std::unique_ptr<IExecutionProvider>> providers;
+  providers.emplace_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
+}
+
 #if defined(USE_CUDA)
 TEST(GradientUtilsTest, InPlaceAccumulatorV2_GPU) {
   std::vector<std::vector<int64_t>> test_dims{
@@ -2273,6 +2340,21 @@ TEST(GradientUtilsTest, InPlaceAccumulatorV2_Float16) {
   providers.emplace_back(DefaultCudaExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
 }
+
+// CUDA counterpart of the CPU test: optional accumulation_buffer_out omitted, only "updated" asserted.
+TEST(GradientUtilsTest, InPlaceAccumulatorV2_NoAccumulationOutput_GPU) {
+  OpTester test("InPlaceAccumulatorV2", 1, onnxruntime::kMSDomain);
+
+  test.AddInput<float>("old_sum", {3}, {1.f, 2.f, 3.f});
+  test.AddInput<float>("value", {3}, {4.f, 5.f, 6.f});
+  test.AddInput<bool>("overwrite", {1}, {false});
+  test.AddOutput<bool>("updated", {1}, {true});
+  test.AddOptionalOutputEdge<float>();  // stands in for the omitted optional second output
+
+  std::vector<std::unique_ptr<IExecutionProvider>> providers;
+  providers.emplace_back(DefaultCudaExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
+}
 #endif
 
 #if defined(USE_CUDA)
diff --git a/orttraining/orttraining/training_ops/cpu/optimizer/gradient_control.cc b/orttraining/orttraining/training_ops/cpu/optimizer/gradient_control.cc
@@ -95,6 +95,10 @@ Status InPlaceAccumulatorV2<T>::Compute(OpKernelContext* context) const {
   const Tensor* new_value = context->Input<Tensor>(1);
   const Tensor* overwrite_tensor = context->Input<Tensor>(2);
 
+  ORT_RETURN_IF_NOT(accumulation_buffer->Shape() == new_value->Shape(),
+                    "InPlaceAccumulatorV2: accumulation_buffer shape (", accumulation_buffer->Shape(),
+                    ") must match value shape (", new_value->Shape(), ").");
+
   void* accumulation_buffer_data = accumulation_buffer->template MutableData<T>();
   const bool overwrite = overwrite_tensor != nullptr ? *(overwrite_tensor->template Data<bool>()) : false;
 
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc
@@ -119,6 +119,10 @@ Status InPlaceAccumulatorV2<T, T_GRAD>::ComputeInternal(OpKernelContext* ctx) co
   const Tensor* overwrite_tensor = ctx->Input<Tensor>(2);
   const bool overwrite = overwrite_tensor != nullptr ? *(overwrite_tensor->template Data<bool>()) : false;
 
+  ORT_RETURN_IF_NOT(left_addee_buffer.Shape() == right_addee_buffer.Shape(),
+                    "InPlaceAccumulatorV2: accumulation_buffer shape (", left_addee_buffer.Shape(),
+                    ") must match value shape (", right_addee_buffer.Shape(), ").");
+
   if (overwrite) {
     const T_GRAD* source = right_addee_buffer.template Data<T_GRAD>();
     T* target = left_addee_buffer.template MutableData<T>();