From 294b2776155013c1b4606a4d171b20dfe23fee6d Mon Sep 17 00:00:00 2001
From: Esun Kim <veblush@google.com>
Date: Thu, 18 Jun 2026 10:43:37 -0700
Subject: [PATCH 1/2] Sync tensorflow/lite sources and update ci/tflite_files.txt

---
 ci/tflite_files.txt                           |  25 +--
 tensorflow/lite/core/c/common.cc              |  34 +++-
 tensorflow/lite/core/c/common.h               |  41 ++++-
 tensorflow/lite/kernels/internal/BUILD        |   1 +
 tensorflow/lite/kernels/internal/common.h     |  20 +++
 .../lite/kernels/internal/reference/add.h     |  30 +++-
 .../kernels/internal/reference/batch_matmul.h | 162 +++++++++---------
 .../internal/reference/binary_function.h      |  43 +----
 .../internal/reference/broadcast_loop.h       | 132 ++++++++++++++
 .../kernels/internal/reference/comparisons.cc |  41 -----
 .../kernels/internal/reference/comparisons.h  |  79 +++------
 .../internal/reference/concatenation.h        |   3 +-
 .../internal/reference/depthwiseconv_uint8.h  |   6 +-
 .../lite/kernels/internal/reference/div.h     | 109 ++++--------
 .../internal/reference/integer_ops/add.h      |  14 ++
 .../internal/reference/maximum_minimum.h      |  26 +--
 .../lite/kernels/internal/reference/prelu.h   |  64 +++----
 .../lite/kernels/internal/reference/softmax.h |  25 +--
 .../kernels/internal/reference/transpose.h    |   3 +
 tensorflow/lite/kernels/internal/types.h      |  44 ++++-
 tensorflow/lite/kernels/kernel_util.cc        |  46 +++--
 tensorflow/lite/kernels/kernel_util.h         |  27 +++
 .../lite/tools/flatbuffer_utils_test.py       |   6 +-
 tensorflow/lite/tools/test_utils.py           |   2 +-
 tensorflow/lite/tools/visualize_test.py       |   4 +-
 tensorflow/lite/types/BUILD                   |  11 ++
 tensorflow/lite/types/half.h                  |  83 +++++++++
 27 files changed, 680 insertions(+), 401 deletions(-)
 create mode 100644 tensorflow/lite/kernels/internal/reference/broadcast_loop.h
 delete mode 100644 tensorflow/lite/kernels/internal/reference/comparisons.cc
 create mode 100644 tensorflow/lite/types/BUILD
 create mode 100644 tensorflow/lite/types/half.h

diff --git a/ci/tflite_files.txt b/ci/tflite_files.txt
index 351ece5cca4..ccbfd354215 100644
--- a/ci/tflite_files.txt
+++ b/ci/tflite_files.txt
@@ -30,16 +30,17 @@ tensorflow/lite/core/c/common.h
 tensorflow/lite/core/macros.h
 tensorflow/lite/kernels/internal/common.h
 tensorflow/lite/kernels/internal/compatibility.h
-tensorflow/lite/kernels/internal/portable_tensor_utils.h
 tensorflow/lite/kernels/internal/portable_tensor_utils.cc
+tensorflow/lite/kernels/internal/portable_tensor_utils.h
 tensorflow/lite/kernels/internal/quantization_util.h
-tensorflow/lite/kernels/internal/reference/add.h
 tensorflow/lite/kernels/internal/reference/add_n.h
+tensorflow/lite/kernels/internal/reference/add.h
 tensorflow/lite/kernels/internal/reference/arg_min_max.h
-tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h
 tensorflow/lite/kernels/internal/reference/batch_matmul.h
+tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h
 tensorflow/lite/kernels/internal/reference/binary_function.h
 tensorflow/lite/kernels/internal/reference/broadcast_args.h
+tensorflow/lite/kernels/internal/reference/broadcast_loop.h
 tensorflow/lite/kernels/internal/reference/broadcast_to.h
 tensorflow/lite/kernels/internal/reference/ceil.h
 tensorflow/lite/kernels/internal/reference/comparisons.h
@@ -54,17 +55,17 @@ tensorflow/lite/kernels/internal/reference/div.h
 tensorflow/lite/kernels/internal/reference/elu.h
 tensorflow/lite/kernels/internal/reference/exp.h
 tensorflow/lite/kernels/internal/reference/fill.h
-tensorflow/lite/kernels/internal/reference/floor.h
 tensorflow/lite/kernels/internal/reference/floor_div.h
 tensorflow/lite/kernels/internal/reference/floor_mod.h
+tensorflow/lite/kernels/internal/reference/floor.h
 tensorflow/lite/kernels/internal/reference/fully_connected.h
 tensorflow/lite/kernels/internal/reference/hard_swish.h
 tensorflow/lite/kernels/internal/reference/integer_ops/add.h
 tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
 tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
 tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
-tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
 tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h
+tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
 tensorflow/lite/kernels/internal/reference/integer_ops/mean.h
 tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
 tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -73,14 +74,16 @@ tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
 tensorflow/lite/kernels/internal/reference/l2normalization.h
 tensorflow/lite/kernels/internal/reference/leaky_relu.h
 tensorflow/lite/kernels/internal/reference/log_softmax.h
+tensorflow/lite/kernels/internal/reference/logistic.h
+tensorflow/lite/kernels/internal/reference/lstm_cell.h
 tensorflow/lite/kernels/internal/reference/maximum_minimum.h
 tensorflow/lite/kernels/internal/reference/mul.h
 tensorflow/lite/kernels/internal/reference/neg.h
 tensorflow/lite/kernels/internal/reference/pad.h
 tensorflow/lite/kernels/internal/reference/pooling.h
+tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
 tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
 tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
-tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
 tensorflow/lite/kernels/internal/reference/prelu.h
 tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
 tensorflow/lite/kernels/internal/reference/quantize.h
@@ -90,18 +93,16 @@ tensorflow/lite/kernels/internal/reference/resize_bilinear.h
 tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h
 tensorflow/lite/kernels/internal/reference/reverse.h
 tensorflow/lite/kernels/internal/reference/round.h
+tensorflow/lite/kernels/internal/reference/select.h
+tensorflow/lite/kernels/internal/reference/slice.h
 tensorflow/lite/kernels/internal/reference/softmax.h
 tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h
 tensorflow/lite/kernels/internal/reference/space_to_depth.h
-tensorflow/lite/kernels/internal/reference/sub.h
-tensorflow/lite/kernels/internal/reference/logistic.h
-tensorflow/lite/kernels/internal/reference/lstm_cell.h
-tensorflow/lite/kernels/internal/reference/select.h
-tensorflow/lite/kernels/internal/reference/slice.h
 tensorflow/lite/kernels/internal/reference/strided_slice.h
+tensorflow/lite/kernels/internal/reference/sub.h
 tensorflow/lite/kernels/internal/reference/tanh.h
-tensorflow/lite/kernels/internal/reference/transpose.h
 tensorflow/lite/kernels/internal/reference/transpose_conv.h
+tensorflow/lite/kernels/internal/reference/transpose.h
 tensorflow/lite/kernels/internal/cppmath.h
 tensorflow/lite/kernels/internal/max.h
 tensorflow/lite/kernels/internal/min.h
diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc
index 6d247309ec5..37254cacc2f 100644
--- a/tensorflow/lite/core/c/common.cc
+++ b/tensorflow/lite/core/c/common.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #endif  // TF_LITE_STATIC_MEMORY
 
 #include <cstring>
+#include <new>
 #include <type_traits>
 #include <utility>
 
@@ -111,6 +112,7 @@ TfLiteSparsity TfLiteSparsityClone(const TfLiteSparsity& src) {
   if (src.dim_metadata) {
     dst.dim_metadata = reinterpret_cast<TfLiteDimensionMetadata*>(
         calloc(1, sizeof(TfLiteDimensionMetadata) * src.dim_metadata_size));
+    if (src.dim_metadata_size > 0 && !dst.dim_metadata) return TfLiteSparsity();
     for (int i = 0; i < src.dim_metadata_size; ++i) {
       dst.dim_metadata[i] = src.dim_metadata[i];
       dst.dim_metadata[i].array_segments =
@@ -129,6 +131,7 @@ TfLiteSparsity* TfLiteSparsityClone(const TfLiteSparsity* const src) {
   }
   TfLiteSparsity* dst =
       reinterpret_cast<TfLiteSparsity*>(calloc(1, sizeof(TfLiteSparsity)));
+  if (!dst) return nullptr;
   *dst = TfLiteSparsityClone(*src);
   return dst;
 }
@@ -147,6 +150,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) {
       break;
     case kTfLiteAffineQuantization: {
       dst.params = calloc(1, sizeof(TfLiteAffineQuantization));
+      if (!dst.params) return TfLiteQuantization();
       const TfLiteAffineQuantization* const src_params =
           reinterpret_cast<TfLiteAffineQuantization*>(src.params);
       TfLiteAffineQuantization* const dst_params =
@@ -158,6 +162,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) {
     }
     case kTfLiteBlockwiseQuantization: {
       dst.params = calloc(1, sizeof(TfLiteBlockwiseQuantization));
+      if (!dst.params) return TfLiteQuantization();
       const TfLiteBlockwiseQuantization* const src_params =
           (TfLiteBlockwiseQuantization*)(src.params);
       TfLiteBlockwiseQuantization* const dst_params =
@@ -219,6 +224,9 @@ TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src) {
 void TfLiteFloatArrayFree(TfLiteFloatArray* a) { TfLiteVarArrayFree(a); }
 
 void TfLiteTensorDataFree(TfLiteTensor* t) {
+  if (t == nullptr) {
+    return;
+  }
   if (t->allocation_type == kTfLiteVariantObject && t->data.data) {
     delete static_cast<VariantData*>(t->data.data);
   } else if (t->allocation_type == kTfLiteDynamic ||
@@ -238,6 +246,9 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
 }
 
 void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
+  if (quantization == nullptr) {
+    return;
+  }
   if (quantization->type == kTfLiteAffineQuantization) {
     TfLiteAffineQuantization* q_params =
         reinterpret_cast<TfLiteAffineQuantization*>(quantization->params);
@@ -294,6 +305,9 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) {
 }
 
 void TfLiteTensorFree(TfLiteTensor* t) {
+  if (t == nullptr) {
+    return;
+  }
   TfLiteTensorDataFree(t);
   if (t->dims) TfLiteIntArrayFree(t->dims);
   t->dims = nullptr;
@@ -308,7 +322,7 @@ void TfLiteTensorFree(TfLiteTensor* t) {
   t->sparsity = nullptr;
 }
 
-TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) {
+TfLiteTensor TfLiteTensorClone(TfLiteTensor src) {
   // We copy all of the source data first, then we clone the fields that can't
   // be shared between two tensor instances.
   TfLiteTensor dst = src;
@@ -335,16 +349,18 @@ TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) {
         break;
       case kTfLiteAllocationStrategyMalloc:
         dst.data.data = malloc(src.bytes);
+        if (src.bytes > 0 && !dst.data.data) return TfLiteTensor();
         std::memcpy(dst.data.data, src.data.data, src.bytes);
         break;
       case kTfLiteAllocationStrategyNew:
         // Special case for variant objects. They are allocated using new/delete
         // but require using the `CloneTo` function.
         if (src.allocation_type == kTfLiteVariantObject) {
-          dst.data.data = reinterpret_cast<const VariantData*>(src.data.data)
-                              ->CloneTo(nullptr);
+          dst.data.data =
+              static_cast<const VariantData*>(src.data.data)->CloneTo(nullptr);
         } else {
-          dst.data.data = new char[src.bytes];
+          dst.data.data = new (std::nothrow) char[src.bytes];
+          if (src.bytes > 0 && !dst.data.data) return TfLiteTensor();
           std::memcpy(dst.data.data, src.data.data, src.bytes);
         }
         break;
@@ -394,13 +410,21 @@ TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) {
     }
     auto* dst_vd = static_cast<VariantData*>(dst->data.data);
     auto* src_vd = static_cast<VariantData*>(src->data.data);
+    if (!src_vd) return kTfLiteError;
 
     // `CloneTo` will handle the case when `dst_vd` is nullptr, so it is safe
     // to `CloneTo` something which was "freed". Also, returning from `CloneTo`
     // will implicitly cast to `VariantData`; don't need static cast here.
     dst->data.data = src_vd->CloneTo(dst_vd);
   } else {
-    memcpy(dst->data.raw, src->data.raw, src->bytes);
+    if (dst->allocation_type == kTfLiteVariantObject) {
+      TfLiteTensorDataFree(dst);
+      dst->allocation_type = src->allocation_type;
+    }
+    if (src->bytes > 0) {
+      if (!dst->data.raw || !src->data.raw) return kTfLiteError;
+      memcpy(dst->data.raw, src->data.raw, src->bytes);
+    }
   }
   dst->buffer_handle = src->buffer_handle;
   dst->data_is_stale = src->data_is_stale;
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index a3b0dbd7492..8ea233516b3 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -56,6 +56,7 @@ limitations under the License.
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 
 #include "tensorflow/lite/core/c/c_api_types.h"  // IWYU pragma: export
 
@@ -277,13 +278,34 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     }                                                                        \
   } while (0)
 
-#define TF_LITE_ENSURE_OK(context, status) \
-  do {                                     \
-    const TfLiteStatus s = (status);       \
-    if ((s) != kTfLiteOk) {                \
-      return s;                            \
-    }                                      \
+#ifndef TF_LITE_STRIP_ERROR_STRINGS
+#define TF_LITE_VAR_ARG_HEAD(FIRST, ...) FIRST
+#define TF_LITE_STRINGIFY_HELPER(x) #x
+#define TF_LITE_STRINGIFY(x) TF_LITE_STRINGIFY_HELPER(x)
+// Checks that `status` evaluates to `kTfLiteOk`.
+//
+// Can take a printf style log message and its parameters after the status. The
+// message will be printed using `TF_LITE_KERNEL_LOG` in case of error.
+#define TF_LITE_ENSURE_OK(context, status, ...)                              \
+  do {                                                                       \
+    const TfLiteStatus s = (status);                                         \
+    if (s != kTfLiteOk) {                                                    \
+      if (sizeof(TF_LITE_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) {       \
+        TF_LITE_MAYBE_KERNEL_LOG((context), __FILE__ ":" TF_LITE_STRINGIFY(  \
+                                                __LINE__) ": " __VA_ARGS__); \
+      }                                                                      \
+      return s;                                                              \
+    }                                                                        \
   } while (0)
+#else
+#define TF_LITE_ENSURE_OK(context, status, ...) \
+  do {                                          \
+    const TfLiteStatus s = (status);            \
+    if ((s) != kTfLiteOk) {                     \
+      return s;                                 \
+    }                                           \
+  } while (0)
+#endif
 
 // `std::unreachable` not available until CC23.
 #ifdef __GNUC__  // GCC, Clang, ICC
@@ -1060,6 +1082,13 @@ typedef struct TfLiteContext {
   /// WARNING: This is an experimental interface that is subject to change.
   TfLiteStatus (*ReleaseSubgraphContext)(struct TfLiteContext* context,
                                          int subgraph_index);
+#if defined(_WIN32)
+  /// Create an array of a given `size` (uninitialized entries).
+  TfLiteIntArray* (*TfLiteIntArrayCreate)(int size);  // NOLINT
+
+  /// Free memory of array `a`.
+  void (*TfLiteIntArrayFree)(TfLiteIntArray* a);  // NOLINT
+#endif                                            // defined(_WIN32)
 } TfLiteContext;
 
 /// `TfLiteOperator` is an external version of `TfLiteRegistration`
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 79cb502eeb8..e18452a9340 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -146,6 +146,7 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         ":compatibility",
+        "//tensorflow/lite/types:half",
     ],
 )
 
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index 4d990d70aa0..929168b7098 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -78,7 +78,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
       if (!broadcast_input1) {
         broadcast_input1 = true;
         broadcast_input2 = false;
+        if (num_compressed_dims >= MAX_DIM) return false;
         num_compressed_dims++;
+        if (num_compressed_dims > MAX_DIM) {
+          return false;
+        }
       }
       compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
       compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
@@ -86,7 +90,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
       if (!broadcast_input2) {
         broadcast_input1 = false;
         broadcast_input2 = true;
+        if (num_compressed_dims >= MAX_DIM) return false;
         num_compressed_dims++;
+        if (num_compressed_dims > MAX_DIM) {
+          return false;
+        }
       }
       compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
       compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
@@ -95,7 +103,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
       if (broadcast_input1 || broadcast_input2 || first_nonunit) {
         broadcast_input1 = false;
         broadcast_input2 = false;
+        if (num_compressed_dims >= MAX_DIM) return false;
         num_compressed_dims++;
+        if (num_compressed_dims > MAX_DIM) {
+          return false;
+        }
       }
       compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
       compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
@@ -105,7 +117,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
   }
   if (num_input1_dims > num_input2_dims) {
     if (!broadcast_input2) {
+      if (num_compressed_dims >= MAX_DIM) return false;
       num_compressed_dims++;
+      if (num_compressed_dims > MAX_DIM) {
+        return false;
+      }
     }
     for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) {
       const size_t input1_dim = input1_dims[i];
@@ -117,7 +133,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
     }
   } else if (num_input2_dims > num_input1_dims) {
     if (!broadcast_input1) {
+      if (num_compressed_dims >= MAX_DIM) return false;
       num_compressed_dims++;
+      if (num_compressed_dims > MAX_DIM) {
+        return false;
+      }
     }
     for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) {
       const size_t input2_dim = input2_dims[i];
diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
index 5b520bd1e13..d41299d16d3 100644
--- a/tensorflow/lite/kernels/internal/reference/add.h
+++ b/tensorflow/lite/kernels/internal/reference/add.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 
 namespace tflite {
 
@@ -39,7 +40,7 @@ inline void Add(const ArithmeticParams& params,
   const int flat_size =
       MatchingElementsSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
+    output_data[i] = ActivationFunctionWithMinMax<T>(
         input1_data[i] + input2_data[i], activation_min, activation_max);
   }
 }
@@ -328,6 +329,20 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
   constexpr int kMaxBroadcastDim = 6;
   T activation_min, activation_max;
   GetActivationParams(params, &activation_min, &activation_max);
+  const int broadcast_rank = std::max(
+      output_shape.DimensionsCount(),
+      std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount()));
+  if (broadcast_rank > kMaxBroadcastDim) {
+    ForEachBroadcastedElement(
+        input1_shape, input2_shape, output_shape,
+        [&](int output_index, int input1_index, int input2_index) {
+          output_data[output_index] = ActivationFunctionWithMinMax(
+              static_cast<T>(input1_data[input1_index] +
+                             input2_data[input2_index]),
+              activation_min, activation_max);
+        });
+    return;
+  }
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -421,6 +436,19 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& input2_shape, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
   constexpr int kMaxBroadcastDim = 6;
+  const int broadcast_rank = std::max(
+      output_shape.DimensionsCount(),
+      std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount()));
+  if (broadcast_rank > kMaxBroadcastDim) {
+    ForEachBroadcastedElement(
+        input1_shape, input2_shape, output_shape,
+        [&](int output_index, int input1_index, int input2_index) {
+          AddElementwise(1, params, input1_data + input1_index,
+                         input2_data + input2_index,
+                         output_data + output_index);
+        });
+    return;
+  }
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
index 71f456703a3..a0853526233 100644
--- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h
+++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstdint>
+#include <limits>
 
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -38,13 +39,13 @@ inline int broadcast_dim(int lhs_dim, int rhs_dim) {
 
 // Compute the "extent" for iterating on this dimension.
 // If we are broadcasting, then don't advance (i.e return 0).
-inline int extent(const RuntimeShape& shape, int x) {
+inline size_t extent(const RuntimeShape& shape, int x) {
   if (shape.Dims(x) == 1) {
     return 0;
   }
-  int prod = 1;
+  size_t prod = 1;
   for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
-    prod *= shape.Dims(i);
+    prod *= static_cast<size_t>(shape.Dims(i));
   }
   return prod;
 }
@@ -60,45 +61,45 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  const int batch_dim0 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+  const size_t batch_dim0 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)));
+  const size_t batch_dim1 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)));
+  const size_t batch_dim2 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)));
 
-  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
+  const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
-  const int lhs_rows = extended_lhs_shape.Dims(3);
-  const int rhs_cols = extended_rhs_shape.Dims(4);
-  const int accum_depth = extended_lhs_shape.Dims(4);
+  const size_t lhs_rows = static_cast<size_t>(extended_lhs_shape.Dims(3));
+  const size_t rhs_cols = static_cast<size_t>(extended_rhs_shape.Dims(4));
+  const size_t accum_depth = static_cast<size_t>(extended_lhs_shape.Dims(4));
 
-  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+  for (size_t b0 = 0; b0 < batch_dim0; ++b0) {
     const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
     const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
-    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+    for (size_t b1 = 0; b1 < batch_dim1; ++b1) {
       const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
       const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
-      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+      for (size_t b2 = 0; b2 < batch_dim2; ++b2) {
         const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
         const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
         Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
                                        b1 * batch_dim2 + b2) *
                                           lhs_rows * rhs_cols;
-        for (int j = 0; j < rhs_cols; ++j) {
-          for (int i = 0; i < lhs_rows; ++i) {
+        for (size_t j = 0; j < rhs_cols; ++j) {
+          for (size_t i = 0; i < lhs_rows; ++i) {
             Tout total = 0;
-            for (int k = 0; k < accum_depth; ++k) {
+            for (size_t k = 0; k < accum_depth; ++k) {
               total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
                        static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
             }
-            int idx = lhs_rows * j + i;
+            size_t idx = lhs_rows * j + i;
             out_ptr[idx] = total;
           }
         }
@@ -119,57 +120,62 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  const int batch_dim0 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+  const size_t batch_dim0 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)));
+  const size_t batch_dim1 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)));
+  const size_t batch_dim2 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)));
 
-  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
+  const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
-  const int lhs_rows = extended_lhs_shape.Dims(3);
-  const int rhs_cols = extended_rhs_shape.Dims(4);
-  const int accum_depth = extended_lhs_shape.Dims(4);
+  const size_t lhs_rows = static_cast<size_t>(extended_lhs_shape.Dims(3));
+  const size_t rhs_cols = static_cast<size_t>(extended_rhs_shape.Dims(4));
+  const size_t accum_depth = static_cast<size_t>(extended_lhs_shape.Dims(4));
 
-  const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
-  const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
-  const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
-  const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
-  const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
-  const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
+  const size_t ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
+  const size_t ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
+  const size_t ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
+  const size_t woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
+  const size_t woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
+  const size_t woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
 
   if (!compute_row_sums || *compute_row_sums) {
-    int num_weights_matrices = 1;
+    size_t num_weights_matrices = 1;
     for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
-      num_weights_matrices *= extended_lhs_shape.Dims(i);
+      num_weights_matrices *= static_cast<size_t>(extended_lhs_shape.Dims(i));
     }
+    TFLITE_DCHECK_LE(num_weights_matrices * lhs_rows,
+                     static_cast<size_t>(std::numeric_limits<int>::max()));
+    TFLITE_DCHECK_LE(accum_depth,
+                     static_cast<size_t>(std::numeric_limits<int>::max()));
     tensor_utils::ReductionSumVector(
-        lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
+        lhs_data, row_sums, static_cast<int>(num_weights_matrices * lhs_rows),
+        static_cast<int>(accum_depth));
     if (compute_row_sums) {
       *compute_row_sums = false;
     }
   }
 
-  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+  for (size_t b0 = 0; b0 < batch_dim0; ++b0) {
     const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
     const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
     const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
     const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
     const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
-    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+    for (size_t b1 = 0; b1 < batch_dim1; ++b1) {
       const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
       const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
       const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
       const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
       const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
-      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+      for (size_t b2 = 0; b2 < batch_dim2; ++b2) {
         const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
         const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
         const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
@@ -178,18 +184,18 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
         float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
                                         b1 * batch_dim2 + b2) *
                                            lhs_rows * rhs_cols;
-        for (int j = 0; j < rhs_cols; ++j) {
+        for (size_t j = 0; j < rhs_cols; ++j) {
           const float batch_scaling_factor = scale_ptr2[j];
           const float batch_offset = static_cast<float>(ioff_ptr2[j]);
-          for (int i = 0; i < lhs_rows; ++i) {
+          for (size_t i = 0; i < lhs_rows; ++i) {
             int32_t total = 0;
-            for (int k = 0; k < accum_depth; ++k) {
+            for (size_t k = 0; k < accum_depth; ++k) {
               total +=
                   lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
             }
             int32_t row_sum = woff_ptr2[i];
             total -= row_sum * batch_offset;
-            int idx = lhs_rows * j + i;
+            size_t idx = lhs_rows * j + i;
             float scale = batch_scaling_factor;
             if (per_channel_scales) {
               scale *= per_channel_scales[i];
@@ -214,24 +220,24 @@ inline void BatchMatMul(const FullyConnectedParams& params,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  const int batch_dim0 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 = batch_matmul::broadcast_dim(
-      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+  const size_t batch_dim0 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)));
+  const size_t batch_dim1 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)));
+  const size_t batch_dim2 = static_cast<size_t>(batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)));
 
-  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
+  const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
-  const int lhs_rows = extended_lhs_shape.Dims(3);
-  const int rhs_cols = extended_rhs_shape.Dims(4);
-  const int accum_depth = extended_lhs_shape.Dims(4);
+  const size_t lhs_rows = static_cast<size_t>(extended_lhs_shape.Dims(3));
+  const size_t rhs_cols = static_cast<size_t>(extended_rhs_shape.Dims(4));
+  const size_t accum_depth = static_cast<size_t>(extended_lhs_shape.Dims(4));
 
   const int32_t input_offset = params.input_offset;
   const int32_t filter_offset = params.weights_offset;
@@ -242,23 +248,23 @@ inline void BatchMatMul(const FullyConnectedParams& params,
   const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
 
-  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+  for (size_t b0 = 0; b0 < batch_dim0; ++b0) {
     const lhsT* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
     const rhsT* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
-    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+    for (size_t b1 = 0; b1 < batch_dim1; ++b1) {
       const lhsT* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
       const rhsT* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
-      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+      for (size_t b2 = 0; b2 < batch_dim2; ++b2) {
         const lhsT* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
         const rhsT* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
         outputT* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
                                           b1 * batch_dim2 + b2) *
                                              lhs_rows * rhs_cols;
 
-        for (int j = 0; j < rhs_cols; ++j) {
-          for (int i = 0; i < lhs_rows; ++i) {
+        for (size_t j = 0; j < rhs_cols; ++j) {
+          for (size_t i = 0; i < lhs_rows; ++i) {
             AccumT total = 0;
-            for (int k = 0; k < accum_depth; ++k) {
+            for (size_t k = 0; k < accum_depth; ++k) {
               AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
               AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
               total += (lhs_val + filter_offset) * (rhs_val + input_offset);
@@ -268,7 +274,7 @@ inline void BatchMatMul(const FullyConnectedParams& params,
             total_scaled += output_offset;
             total_scaled = std::max(total_scaled, output_activation_min);
             total_scaled = std::min(total_scaled, output_activation_max);
-            const int idx = lhs_rows * j + i;
+            const size_t idx = lhs_rows * j + i;
             out_ptr[idx] = static_cast<outputT>(total_scaled);
           }
         }
diff --git a/tensorflow/lite/kernels/internal/reference/binary_function.h b/tensorflow/lite/kernels/internal/reference/binary_function.h
index 0b124af87f0..611fc3241b0 100644
--- a/tensorflow/lite/kernels/internal/reference/binary_function.h
+++ b/tensorflow/lite/kernels/internal/reference/binary_function.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -32,42 +33,12 @@ inline void BroadcastBinaryFunction4DSlow(
     const RuntimeShape& unextended_input2_shape, const T2* input2_data,
     const RuntimeShape& unextended_output_shape, R* output_data,
     R (*func)(T1, T2)) {
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
-                                      unextended_input2_shape, &desc1, &desc2);
-
-  const int* dims_data =
-      reinterpret_cast<const int*>(output_shape.DimsDataUpTo5D());
-  for (int b = 0; b < output_shape.Dims(0); ++b) {
-    int out_idx_b = b * dims_data[1];
-    int in_idx1_b = desc1.strides[0] * b;
-    int in_idx2_b = desc2.strides[0] * b;
-    for (int y = 0; y < output_shape.Dims(1); ++y) {
-      int out_idx_y = (out_idx_b + y) * dims_data[2];
-      int in_idx1_y = in_idx1_b + desc1.strides[1] * y;
-      int in_idx2_y = in_idx2_b + desc2.strides[1] * y;
-      for (int x = 0; x < output_shape.Dims(2); ++x) {
-        int out_idx_x = (out_idx_y + x) * dims_data[3];
-        int in1_idx = in_idx1_y + desc1.strides[2] * x;
-        int in2_idx = in_idx2_y + desc2.strides[2] * x;
-        for (int c = 0; c < output_shape.Dims(3); ++c) {
-          auto out_idx = out_idx_x + c;
-          auto in1_val = input1_data[in1_idx];
-          auto in2_val = input2_data[in2_idx];
-          output_data[out_idx] = func(in1_val, in2_val);
-          in1_idx += desc1.strides[3];
-          in2_idx += desc2.strides[3];
-        }
-      }
-    }
-  }
+  ForEachBroadcastedElement(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape,
+      [&](int output_index, int input1_index, int input2_index) {
+        output_data[output_index] =
+            func(input1_data[input1_index], input2_data[input2_index]);
+      });
 }
 
 // R: Result type. T1: Input 1 type. T2: Input 2 type.
diff --git a/tensorflow/lite/kernels/internal/reference/broadcast_loop.h b/tensorflow/lite/kernels/internal/reference/broadcast_loop.h
new file mode 100644
index 00000000000..d52f2904637
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/broadcast_loop.h
@@ -0,0 +1,132 @@
+/* Copyright 2026 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline std::vector<int> BroadcastStridesForShape(
+    const RuntimeShape& unextended_shape,  // Shape of the input to index.
+    const RuntimeShape& extended_output_shape) {  // Output shape to match.
+  const int dims_count = extended_output_shape.DimensionsCount();
+  const RuntimeShape extended_shape =  // Input viewed with dims_count dims.
+      RuntimeShape::ExtendedShape(dims_count, unextended_shape);
+  std::vector<int> strides(dims_count);  // One stride per output dimension.
+  int stride = 1;  // Product of the input dims to the right of i.
+  for (int i = dims_count - 1; i >= 0; --i) {  // Walk from the last dim.
+    const int dim = extended_shape.Dims(i);
+    const int output_dim = extended_output_shape.Dims(i);
+    strides[i] = (dim == 1 && output_dim != 1) ? 0 : stride;  // 0: broadcast.
+    stride *= dim;
+  }
+  return strides;  // Input element strides; 0 where the input is broadcast.
+}
+
+inline std::vector<int> StridesForShape(const RuntimeShape& shape) {
+  const int dims_count = shape.DimensionsCount();
+  std::vector<int> strides(dims_count);  // One stride per dimension.
+  int stride = 1;  // Product of the dims to the right of i.
+  for (int i = dims_count - 1; i >= 0; --i) {  // Walk from the last dim.
+    strides[i] = stride;
+    stride *= shape.Dims(i);
+  }
+  return strides;  // Dense element strides; the last dim gets stride 1.
+}
+
+template <typename Fn>  // Called as fn(out, in1, in2) with int indices.
+inline void ForEachBroadcastedElement(const RuntimeShape& input1_shape,
+                                      const RuntimeShape& input2_shape,
+                                      const RuntimeShape& output_shape, Fn fn) {
+  const int dims_count = std::max(  // Largest rank among the three shapes.
+      output_shape.DimensionsCount(),
+      std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount()));
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(dims_count, output_shape);
+  const std::vector<int> output_strides =
+      StridesForShape(extended_output_shape);
+  const std::vector<int> input1_strides =
+      BroadcastStridesForShape(input1_shape, extended_output_shape);
+  const std::vector<int> input2_strides =
+      BroadcastStridesForShape(input2_shape, extended_output_shape);
+
+  const int flat_size = output_shape.FlatSize();  // fn runs once per element.
+  for (int output_index = 0; output_index < flat_size; ++output_index) {
+    int remaining_index = output_index;  // Part of the index not yet decoded.
+    int input1_index = 0;
+    int input2_index = 0;
+    for (int dim = 0; dim < dims_count; ++dim) {  // Decode from dim 0 inward.
+      const int output_stride = output_strides[dim];
+      const int coordinate =  // Position of this element along `dim`.
+          output_stride == 0 ? 0 : remaining_index / output_stride;
+      if (output_stride != 0) {  // Skip the modulo when it would divide by 0.
+        remaining_index %= output_stride;
+      }
+      input1_index += coordinate * input1_strides[dim];
+      input2_index += coordinate * input2_strides[dim];
+    }
+    fn(output_index, input1_index, input2_index);  // Flat element offsets.
+  }
+}
+
+template <typename Fn>  // Called as fn(out, in1, in2, in3) with int indices.
+inline void ForEachBroadcastedElement(const RuntimeShape& input1_shape,
+                                      const RuntimeShape& input2_shape,
+                                      const RuntimeShape& input3_shape,
+                                      const RuntimeShape& output_shape, Fn fn) {
+  const int dims_count = std::max(  // Largest rank among the four shapes.
+      std::max(output_shape.DimensionsCount(), input1_shape.DimensionsCount()),
+      std::max(input2_shape.DimensionsCount(), input3_shape.DimensionsCount()));
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(dims_count, output_shape);
+  const std::vector<int> output_strides =
+      StridesForShape(extended_output_shape);
+  const std::vector<int> input1_strides =
+      BroadcastStridesForShape(input1_shape, extended_output_shape);
+  const std::vector<int> input2_strides =
+      BroadcastStridesForShape(input2_shape, extended_output_shape);
+  const std::vector<int> input3_strides =
+      BroadcastStridesForShape(input3_shape, extended_output_shape);
+
+  const int flat_size = output_shape.FlatSize();  // fn runs once per element.
+  for (int output_index = 0; output_index < flat_size; ++output_index) {
+    int remaining_index = output_index;  // Part of the index not yet decoded.
+    int input1_index = 0;
+    int input2_index = 0;
+    int input3_index = 0;
+    for (int dim = 0; dim < dims_count; ++dim) {  // Decode from dim 0 inward.
+      const int output_stride = output_strides[dim];
+      const int coordinate =  // Position of this element along `dim`.
+          output_stride == 0 ? 0 : remaining_index / output_stride;
+      if (output_stride != 0) {  // Skip the modulo when it would divide by 0.
+        remaining_index %= output_stride;
+      }
+      input1_index += coordinate * input1_strides[dim];
+      input2_index += coordinate * input2_strides[dim];
+      input3_index += coordinate * input3_strides[dim];
+    }
+    fn(output_index, input1_index, input2_index, input3_index);  // Flat offsets.
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_
\ No newline at end of file
diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.cc b/tensorflow/lite/kernels/internal/reference/comparisons.cc
deleted file mode 100644
index 36ce951ec17..00000000000
--- a/tensorflow/lite/kernels/internal/reference/comparisons.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
-
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/kernels/internal/runtime_shape.h"
-
-namespace tflite {
-namespace reference_ops {
-
-BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
-    const RuntimeShape& unextended_input1_shape,
-    const RuntimeShape& unextended_input2_shape,
-    const RuntimeShape& unextended_output_shape) {
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
-                                      unextended_input2_shape, &desc1, &desc2);
-  return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
-          desc2};
-}
-
-}  // namespace reference_ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.h b/tensorflow/lite/kernels/internal/reference/comparisons.h
index e40e4045cc7..165110fe8eb 100644
--- a/tensorflow/lite/kernels/internal/reference/comparisons.h
+++ b/tensorflow/lite/kernels/internal/reference/comparisons.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/core/macros.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 #include "tensorflow/lite/kernels/internal/runtime_shape.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
@@ -110,40 +111,18 @@ inline void ComparisonWithScaling(
   }
 }
 
-struct BroadcastComparison4DSlowCommon {
-  const RuntimeShape output_shape;
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-};
-
-TFLITE_NOINLINE
-BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
-    const RuntimeShape& unextended_input1_shape,
-    const RuntimeShape& unextended_input2_shape,
-    const RuntimeShape& unextended_output_shape);
-
 template <typename T, ComparisonFn<T> F>
 inline void BroadcastComparison4DSlowImpl(
     const ComparisonParams& op_params,
     const RuntimeShape& unextended_input1_shape, const T* input1_data,
     const RuntimeShape& unextended_input2_shape, const T* input2_data,
     const RuntimeShape& unextended_output_shape, bool* output_data) {
-  const BroadcastComparison4DSlowCommon dims =
-      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
-                                          unextended_input2_shape,
-                                          unextended_output_shape);
-
-  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
-    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
-      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
-        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
-          output_data[Offset(dims.output_shape, b, y, x, c)] =
-              F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)],
-                input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  ForEachBroadcastedElement(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape,
+      [&](int output_index, int input1_index, int input2_index) {
+        output_data[output_index] =  // One bool result per output element.
+            F(input1_data[input1_index], input2_data[input2_index]);
+      });
 }
 
 template <ComparisonFn<float> F>
@@ -165,11 +144,6 @@ inline void BroadcastComparison4DSlowWithScaling(
     const RuntimeShape& unextended_input1_shape, const T* input1_data,
     const RuntimeShape& unextended_input2_shape, const T* input2_data,
     const RuntimeShape& unextended_output_shape, bool* output_data) {
-  const BroadcastComparison4DSlowCommon dims =
-      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
-                                          unextended_input2_shape,
-                                          unextended_output_shape);
-
   int left_shift = op_params.left_shift;
   int32_t input1_offset = op_params.input1_offset;
   int32_t input1_multiplier = op_params.input1_multiplier;
@@ -178,30 +152,21 @@ inline void BroadcastComparison4DSlowWithScaling(
   int32_t input2_multiplier = op_params.input2_multiplier;
   int input2_shift = op_params.input2_shift;
 
-  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
-    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
-      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
-        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
-          const int32_t input1_val =
-              input1_offset +
-              input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
-          const int32_t input2_val =
-              input2_offset +
-              input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
-          const int32_t shifted_input1_val = input1_val * (1 << left_shift);
-          const int32_t shifted_input2_val = input2_val * (1 << left_shift);
-          const int32_t scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier, input1_shift);
-          const int32_t scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier, input2_shift);
-          output_data[Offset(dims.output_shape, b, y, x, c)] =
-              F(scaled_input1_val, scaled_input2_val);
-        }
-      }
-    }
-  }
+  ForEachBroadcastedElement(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape,
+      [&](int output_index, int input1_index, int input2_index) {
+        const int32_t input1_val = input1_offset + input1_data[input1_index];
+        const int32_t input2_val = input2_offset + input2_data[input2_index];
+        const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+        const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+        const int32_t scaled_input1_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input1_val, input1_multiplier, input1_shift);
+        const int32_t scaled_input2_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input2_val, input2_multiplier, input2_shift);
+        output_data[output_index] = F(scaled_input1_val, scaled_input2_val);
+      });
 }
 
 #define TFLITE_COMPARISON_OP(name)                                             \
diff --git a/tensorflow/lite/kernels/internal/reference/concatenation.h b/tensorflow/lite/kernels/internal/reference/concatenation.h
index 4a82d7c502d..915492b1e92 100644
--- a/tensorflow/lite/kernels/internal/reference/concatenation.h
+++ b/tensorflow/lite/kernels/internal/reference/concatenation.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
 
 #include <algorithm>
+#include <cstddef>
 
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -109,7 +110,7 @@ inline void Concatenation<Int4>(const ConcatenationParams& params,
   // not garbage.
   // Note: output_shape.FlatSize() gives number of elements (nibbles).
   // Bytes needed: (elements + 1) / 2.
-  memset(output_ptr, 0, (output_shape.FlatSize() + 1) / 2);
+  memset(output_ptr, 0, (static_cast<size_t>(output_shape.FlatSize()) + 1) / 2);
 
   int64_t output_offset = 0;
   for (int k = 0; k < outer_size; k++) {
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index d4fba1399fb..be9f5fcbe0c 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -153,7 +153,8 @@ struct DepthwiseConvBasicKernel {
         for (int out_x = 0; out_x < output_width; ++out_x) {
           for (int ic = 0; ic < input_depth; ++ic) {
             for (int m = 0; m < depth_multiplier; m++) {
-              const int oc = m + ic * depth_multiplier;
+              const int64_t oc =
+                  m + static_cast<int64_t>(ic) * depth_multiplier;
               const int in_x_origin = (out_x * stride_width) - pad_width;
               const int in_y_origin = (out_y * stride_height) - pad_height;
               int32_t acc = 0;
@@ -240,7 +241,8 @@ struct DepthwiseConvBasicKernel {
         for (int out_x = 0; out_x < output_width; ++out_x) {
           for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
             for (int m = 0; m < depth_multiplier; ++m) {
-              const int output_channel = m + in_channel * depth_multiplier;
+              const int64_t output_channel =
+                  m + static_cast<int64_t>(in_channel) * depth_multiplier;
               const int in_x_origin = (out_x * stride_width) - pad_width;
               const int in_y_origin = (out_y * stride_height) - pad_height;
               int32_t acc = 0;
diff --git a/tensorflow/lite/kernels/internal/reference/div.h b/tensorflow/lite/kernels/internal/reference/div.h
index 5f26d3b8e6d..9ba3902271f 100644
--- a/tensorflow/lite/kernels/internal/reference/div.h
+++ b/tensorflow/lite/kernels/internal/reference/div.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 
 namespace tflite {
 
@@ -117,50 +118,37 @@ inline void BroadcastDivSlowQuantized(
     const T* input1_data, const RuntimeShape& unextended_input2_shape,
     const T* input2_data, const RuntimeShape& unextended_output_shape,
     T* output_data) {
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
-
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
-                                      unextended_input2_shape, &desc1, &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
-                 &output_desc);
-
   DivCheckArithmeticParams<T>(params);
 
-  auto div_func = [&](int indexes[N]) {
-    int32_t input1_val =
-        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    int32_t input2_val =
-        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    TFLITE_DCHECK_NE(input2_val, 0);
-    if (input2_val < 0) {
-      // Invert signs to avoid a negative input2_val as input2_inv needs to be
-      // positive to be used as multiplier of MultiplyByQuantizedMultiplier.
-      input1_val = -input1_val;
-      input2_val = -input2_val;
-    }
-    int recip_shift;
-    const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
-    const int headroom = CountLeadingSignBits(input1_val);
-    const int32_t unscaled_quotient =
-        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
-                                                    headroom);
-    const int total_shift = params.output_shift - recip_shift - headroom;
-    const int32_t unclamped_result =
-        params.output_offset +
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            unscaled_quotient, params.output_multiplier, total_shift);
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, unclamped_result));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<T>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, div_func);
+  ForEachBroadcastedElement(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape,
+      [&](int output_index, int input1_index, int input2_index) {
+        int32_t input1_val = params.input1_offset + input1_data[input1_index];
+        int32_t input2_val = params.input2_offset + input2_data[input2_index];
+        TFLITE_DCHECK_NE(input2_val, 0);
+        if (input2_val < 0) {
+          // Invert signs to avoid a negative input2_val as input2_inv needs to
+          // be positive to be used as multiplier of
+          // MultiplyByQuantizedMultiplier.
+          input1_val = -input1_val;
+          input2_val = -input2_val;
+        }
+        int recip_shift;
+        const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
+        const int headroom = CountLeadingSignBits(input1_val);
+        const int32_t unscaled_quotient =
+            MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
+                                                        headroom);
+        const int total_shift = params.output_shift - recip_shift - headroom;
+        const int32_t unclamped_result =
+            params.output_offset +
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                unscaled_quotient, params.output_multiplier, total_shift);
+        const int32_t clamped_output = std::min(
+            params.quantized_activation_max,
+            std::max(params.quantized_activation_min, unclamped_result));
+        output_data[output_index] = static_cast<T>(clamped_output);
+      });
 }
 
 template <int N = 5>
@@ -202,10 +190,6 @@ inline void BroadcastDivSlow(const ArithmeticParams& params,
       input2_data, unextended_output_shape, output_data);
 }
 
-// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
 template <typename T, int N = 5>
 void BroadcastDivSlow(const ArithmeticParams& params,
                       const RuntimeShape& unextended_input1_shape,
@@ -218,34 +202,13 @@ void BroadcastDivSlow(const ArithmeticParams& params,
   T output_activation_max;
   GetActivationParams(params, &output_activation_min, &output_activation_max);
 
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
-
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
-                                      unextended_input2_shape, &desc1, &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
-                 &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest
-  // stride, typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-
-  auto div_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] /
-                input2_data[SubscriptToIndex(desc2, indexes)],
+  ForEachBroadcastedElement(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape,
+      [&](int output_index, int input1_index, int input2_index) {
+        output_data[output_index] = ActivationFunctionWithMinMax(
+            input1_data[input1_index] / input2_data[input2_index],
             output_activation_min, output_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, div_func);
+      });
 }
 
 template <typename T>
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
index c2a0e0f082c..9f61a9196a5 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -132,6 +133,19 @@ void BroadcastBinaryFunction6DSlow(
     void (*check_arithmetic_params)(const ArithmeticParams&),
     T (*binary_func)(T, T, const ArithmeticParams&)) {
   constexpr int kMaxBroadcastDim = 6;
+  const int broadcast_rank = std::max(
+      output_shape.DimensionsCount(),
+      std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount()));
+  if (broadcast_rank > kMaxBroadcastDim) {
+    check_arithmetic_params(params);
+    reference_ops::ForEachBroadcastedElement(
+        input1_shape, input2_shape, output_shape,
+        [&](int output_index, int input1_index, int input2_index) {
+          output_data[output_index] = binary_func(
+              input1_data[input1_index], input2_data[input2_index], params);
+        });
+    return;
+  }
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
diff --git a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h
index cd11b4191ac..805801df8a6 100644
--- a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h
+++ b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
 
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -37,24 +38,13 @@ void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape,
       output_data[i] = op(input1_data[i], input2_data[i]);
     }
   } else {
-    TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
-    TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
-    TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
-
-    NdArrayDesc<N> desc1;
-    NdArrayDesc<N> desc2;
-    NdArrayDesc<N> output_desc;
-    NdArrayDescsForElementwiseBroadcast(
-        unextended_input1_shape, unextended_input2_shape, &desc1, &desc2);
-    CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
-                   &output_desc);
-
-    auto maxmin_func = [&](int indexes[N]) {
-      output_data[SubscriptToIndex(output_desc, indexes)] =
-          op(input1_data[SubscriptToIndex(desc1, indexes)],
-             input2_data[SubscriptToIndex(desc2, indexes)]);
-    };
-    NDOpsHelper<N>(output_desc, maxmin_func);
+    ForEachBroadcastedElement(
+        unextended_input1_shape, unextended_input2_shape,
+        unextended_output_shape,
+        [&](int output_index, int input1_index, int input2_index) {
+          output_data[output_index] =
+              op(input1_data[input1_index], input2_data[input2_index]);
+        });
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h
index 1a5ef0cb1f4..400a1bac354 100644
--- a/tensorflow/lite/kernels/internal/reference/prelu.h
+++ b/tensorflow/lite/kernels/internal/reference/prelu.h
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -31,47 +32,30 @@ inline void BroadcastPrelu4DSlow(
     const PreluParams& params, const RuntimeShape& input_shape,
     const T* input_data, const RuntimeShape& alpha_shape, const U* alpha_data,
     const RuntimeShape& output_shape, T* output_data) {
-  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
-
-  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
-    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
-      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
-        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          int output_index = Offset(extended_output_shape, b, y, x, c);
-          int input_index = SubscriptToIndex(desc1, b, y, x, c);
-          const int32_t input_value =
-              params.input_offset + input_data[input_index];
-          int32_t output_value;
-          if (input_value >= 0) {
-            output_value = MultiplyByQuantizedMultiplier(
-                input_value, params.output_multiplier_1, params.output_shift_1);
-          } else {
-            auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
-            const int32_t alpha_value =
-                params.alpha_offset + alpha_data[alpha_index];
-
-            output_value = MultiplyByQuantizedMultiplier(
-                input_value * alpha_value, params.output_multiplier_2,
-                params.output_shift_2);
-          }
-          output_value += params.output_offset;
-
-          const int32_t quantized_min = std::numeric_limits<T>::min();
-          const int32_t quantized_max = std::numeric_limits<T>::max();
-          const int32_t clamped_output =
-              std::min(quantized_max, std::max(quantized_min, output_value));
-          output_data[output_index] = static_cast<T>(clamped_output);
+  const int32_t quantized_min = std::numeric_limits<T>::min();
+  const int32_t quantized_max = std::numeric_limits<T>::max();
+  ForEachBroadcastedElement(
+      input_shape, alpha_shape, output_shape,
+      [&](int output_index, int input_index, int alpha_index) {
+        const int32_t input_value =
+            params.input_offset + input_data[input_index];
+        int32_t output_value;
+        if (input_value >= 0) {
+          output_value = MultiplyByQuantizedMultiplier(
+              input_value, params.output_multiplier_1, params.output_shift_1);
+        } else {
+          const int32_t alpha_value =
+              params.alpha_offset + alpha_data[alpha_index];
+          output_value = MultiplyByQuantizedMultiplier(
+              input_value * alpha_value, params.output_multiplier_2,
+              params.output_shift_2);
         }
-      }
-    }
-  }
+        output_value += params.output_offset;
+
+        const int32_t clamped_output =
+            std::min(quantized_max, std::max(quantized_min, output_value));
+        output_data[output_index] = static_cast<T>(clamped_output);
+      });
 }
 
 template <typename T, typename U>
diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h
index 2930217b61f..27018436503 100644
--- a/tensorflow/lite/kernels/internal/reference/softmax.h
+++ b/tensorflow/lite/kernels/internal/reference/softmax.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <limits>
+#include <type_traits>
 
 #include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -28,9 +29,11 @@ limitations under the License.
 namespace tflite {
 namespace reference_ops {
 
+template <typename T,
+          typename std::enable_if<!std::is_integral<T>::value, int>::type = 0>
 inline void Softmax(const SoftmaxParams& params,
-                    const RuntimeShape& input_shape, const float* input_data,
-                    const RuntimeShape& output_shape, float* output_data) {
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const RuntimeShape& output_shape, T* output_data) {
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
@@ -38,26 +41,24 @@ inline void Softmax(const SoftmaxParams& params,
       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
 
   for (int i = 0; i < outer_size; ++i) {
-    // Find max element value which we'll use to ensure numerical stability
-    // taking advantage of the following equality:
-    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-    float max = std::numeric_limits<float>::lowest();
+    T max = std::numeric_limits<T>::lowest();
     for (int c = 0; c < depth; ++c) {
       max = std::max(max, input_data[i * depth + c]);
     }
 
-    // Compute sum.
     float sum = 0.f;
     for (int c = 0; c < depth; ++c) {
-      const float exp_c = std::exp((input_data[i * depth + c] - max) *
-                                   static_cast<float>(params.beta));
-      output_data[i * depth + c] = exp_c;
+      const float exp_c =
+          std::exp((static_cast<float>(input_data[i * depth + c]) -
+                    static_cast<float>(max)) *
+                   static_cast<float>(params.beta));
+      output_data[i * depth + c] = static_cast<T>(exp_c);
       sum += exp_c;
     }
 
-    // Compute result.
     for (int c = 0; c < depth; ++c) {
-      output_data[i * depth + c] = output_data[i * depth + c] / sum;
+      output_data[i * depth + c] =
+          static_cast<T>(static_cast<float>(output_data[i * depth + c]) / sum);
     }
   }
 }
diff --git a/tensorflow/lite/kernels/internal/reference/transpose.h b/tensorflow/lite/kernels/internal/reference/transpose.h
index 7e2bf7b266a..fbd991141a7 100644
--- a/tensorflow/lite/kernels/internal/reference/transpose.h
+++ b/tensorflow/lite/kernels/internal/reference/transpose.h
@@ -176,6 +176,9 @@ template <typename T, int N = kTransposeMaxDimensions>
 void Transpose(const TransposeParams& params, const RuntimeShape& input_shape,
                const T* input_data, const RuntimeShape& output_shape,
                T* output_data) {
+  if (input_shape.FlatSize() == 0) {
+    return;
+  }
   using transpose_internal::SetupTransposeStrides;
   using transpose_internal::TransposeImpl;
   using transpose_internal::TransposeStorageType;
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 1cfc43d1662..c640e18abde 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -19,10 +19,40 @@ limitations under the License.
 #include <cstdint>
 #include <cstring>
 #include <initializer_list>
+#include <limits>
 #include <type_traits>
 
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/types/half.h"
+
+namespace std {
+template <>
+class numeric_limits<tflite::half> {
+ public:
+  static constexpr bool is_specialized =
+      true;  // NOLINT(readability-identifier-naming)
+  // Smallest positive normal value, mirroring numeric_limits<float>::min().
+  static constexpr tflite::half min() noexcept {
+    return tflite::half::smallest_normal();
+  }
+  static constexpr tflite::half max() noexcept { return tflite::half::max(); }
+  // Most negative finite value; tflite::half::min() provides that pattern.
+  static constexpr tflite::half lowest() noexcept {
+    return tflite::half::min();
+  }
+  static constexpr tflite::half epsilon() noexcept {
+    return tflite::half::epsilon();
+  }
+  // Built from raw bits (0x7e00) unconditionally: the float constructor of
+  // tflite::half is compiled out in half.h and is not constexpr.
+  static constexpr tflite::half quiet_NaN() noexcept {
+    return tflite::half::from_bits(0x7e00);
+  }
+  static constexpr bool is_signed =
+      true;  // NOLINT(readability-identifier-naming)
+};
+}  // namespace std
 
 namespace tflite {
 
@@ -170,11 +200,13 @@ inline bool NextIndex(const int num_dims, const int* dims, IndexType* current) {
   }
   TFLITE_DCHECK(dims != nullptr);
   TFLITE_DCHECK(current != nullptr);
+  for (int i = 0; i < num_dims; ++i) {
+    TFLITE_DCHECK_GE(dims[i], 0);
+  }
   int carry = 1;
   for (int idx = num_dims - 1; idx >= 0; --idx) {
     IndexType current_val = current[idx] + carry;
-    TFLITE_DCHECK_GE(dims[idx], current_val);
-    if (dims[idx] == current_val) {
+    if (current_val >= dims[idx]) {
       current[idx] = 0;
     } else {
       current[idx] = current_val;
@@ -999,7 +1031,7 @@ struct TanhParams {
   int input_left_shift;
 };
 
-constexpr int kTransposeMaxDimensions = 6;
+constexpr int kTransposeMaxDimensions = 8;
 
 struct TransposeParams {
   int8_t perm_count;
@@ -1075,6 +1107,12 @@ inline void GetActivationParams(const P& params, float* min, float* max) {
   *max = params.float_activation_max;
 }
 
+template <typename P>
+inline void GetActivationParams(const P& params, half* min, half* max) {
+  *min = static_cast<half>(params.float_activation_min);
+  *max = static_cast<half>(params.float_activation_max);
+}
+
 template <typename P>
 inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
   *min = params.int64_activation_min;
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index 62feffc1c0a..d281bffd608 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #ifndef TF_LITE_STATIC_MEMORY
 #include <string>
 
+#include "absl/types/span.h"
 #include "tensorflow/lite/array.h"
 #endif  // TF_LITE_STATIC_MEMORY
 
@@ -34,6 +35,10 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 
+#ifndef TF_LITE_STATIC_MEMORY
+#include "tensorflow/lite/util.h"
+#endif
+
 #if defined(__APPLE__)
 #include "TargetConditionals.h"
 #endif
@@ -101,9 +106,8 @@ inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context,
                                         const TfLiteNode* node, int index,
                                         const TfLiteTensor** tensor) {
   int tensor_index;
-  TF_LITE_ENSURE_OK(
-      context, ValidateTensorIndexingSafe(context, index, node->inputs->size,
-                                          node->inputs->data, &tensor_index));
+  TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe(
+      context, index, node->inputs->size, node->inputs->data, &tensor_index));
   *tensor = GetTensorAtIndex(context, tensor_index);
   return kTfLiteOk;
 }
@@ -140,9 +144,8 @@ TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
 TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
                            int index, TfLiteTensor** tensor) {
   int tensor_index;
-  TF_LITE_ENSURE_OK(
-      context, ValidateTensorIndexingSafe(context, index, node->outputs->size,
-                                          node->outputs->data, &tensor_index));
+  TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe(
+      context, index, node->outputs->size, node->outputs->data, &tensor_index));
   *tensor = GetTensorAtIndex(context, tensor_index);
   return kTfLiteOk;
 }
@@ -167,8 +170,8 @@ TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
                               const TfLiteNode* node, int index,
                               TfLiteTensor** tensor) {
   int tensor_index;
-  TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
-                                 context, index, node->temporaries->size,
+  TF_LITE_ENSURE_STATUS(
+      ValidateTensorIndexingSafe(context, index, node->temporaries->size,
                                  node->temporaries->data, &tensor_index));
   *tensor = GetTensorAtIndex(context, tensor_index);
   return kTfLiteOk;
@@ -188,8 +191,8 @@ TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
                                   const TfLiteNode* node, int index,
                                   TfLiteTensor** tensor) {
   int tensor_index;
-  TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
-                                 context, index, node->intermediates->size,
+  TF_LITE_ENSURE_STATUS(
+      ValidateTensorIndexingSafe(context, index, node->intermediates->size,
                                  node->intermediates->data, &tensor_index));
   *tensor = GetTensorAtIndex(context, tensor_index);
   return kTfLiteOk;
@@ -595,4 +598,27 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor) {
   return false;
 }
 
+#ifndef TF_LITE_STATIC_MEMORY
+TfLiteStatus CheckedShapeProduct(TfLiteContext* context,
+                                 absl::Span<const int> dims,
+                                 const char* error_message, size_t& product) {
+  // The CheckedNumElements function already checks for negative dimensions, so
+  // we don't do it here.
+  TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk,
+                     "%s", error_message);
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context,
+                                      absl::Span<const int> dims,
+                                      const char* error_message, int& product) {
+  for (const int dim : dims) {
+    TF_LITE_ENSURE_MSG(context, dim >= 0, "Encountered a negative dimension.");
+  }
+  TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk,
+                     "%s", error_message);
+  return kTfLiteOk;
+}
+#endif
+
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 25e5386ccb6..70436965710 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -17,9 +17,11 @@ limitations under the License.
 
 #include <stdint.h>
 
+#include <cstddef>
 #include <limits>
 #ifndef TF_LITE_STATIC_MEMORY
 #include <string>
+#include "absl/types/span.h"
 #endif  // TF_LITE_STATIC_MEMORY
 
 #include "tensorflow/lite/core/c/builtin_op_data.h"
@@ -341,6 +343,31 @@ bool IsMobilePlatform();
 // Returns whether there is unspecified dimension in the tensor's dim signature.
 bool HasUnspecifiedDimension(const TfLiteTensor* tensor);
 
+#ifndef TF_LITE_STATIC_MEMORY
+/**
+ * Calculates the product of the given dimensions. Returns an error if any of
+ * the dimensions is negative or if the product overflows.
+ * @param context The context to use for error reporting.
+ * @param dims The dimensions to multiply.
+ * @param error_message The error message to use if an error is encountered.
+ * @param product The output parameter to store the product.
+ */
+TfLiteStatus CheckedShapeProduct(TfLiteContext* context,
+                                 absl::Span<const int> dims,
+                                 const char* error_message, size_t& product);
+
+/**
+ * Same as CheckedShapeProduct, but stores the product in an `int`. Returns an
+ * error if any dimension is negative or if the product does not fit in `int`.
+ * @param context The context to use for error reporting.
+ * @param dims The dimensions to multiply.
+ * @param error_message The error message to use if an error is encountered.
+ * @param product The output parameter to store the product.
+ */
+TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context,
+                                      absl::Span<const int> dims,
+                                      const char* error_message, int& product);
+#endif
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/lite/tools/flatbuffer_utils_test.py b/tensorflow/lite/tools/flatbuffer_utils_test.py
index 13074aaca5e..e8a2e46b9be 100644
--- a/tensorflow/lite/tools/flatbuffer_utils_test.py
+++ b/tensorflow/lite/tools/flatbuffer_utils_test.py
@@ -18,9 +18,9 @@
 import subprocess
 import sys
 
-from tflite_micro.tensorflow.lite.python import schema_py_generated as schema  # pylint:disable=g-direct-tensorflow-import
-from tflite_micro.tensorflow.lite.tools import flatbuffer_utils
-from tflite_micro.tensorflow.lite.tools import test_utils
+from tflite_micro.tensorflow.lite.python import schema_py_generated as schema  # pylint:disable=g-direct-tensorflow-import
+from tflite_micro.tensorflow.lite.tools import flatbuffer_utils
+from tflite_micro.tensorflow.lite.tools import test_utils
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/lite/tools/test_utils.py b/tensorflow/lite/tools/test_utils.py
index 44157143d5d..582fbd2879b 100644
--- a/tensorflow/lite/tools/test_utils.py
+++ b/tensorflow/lite/tools/test_utils.py
@@ -18,7 +18,7 @@
 """
 
 import flatbuffers
-from tflite_micro.tensorflow.lite.python import schema_py_generated as schema_fb
+from tflite_micro.tensorflow.lite.python import schema_py_generated as schema_fb
 
 TFLITE_SCHEMA_VERSION = 3
 
diff --git a/tensorflow/lite/tools/visualize_test.py b/tensorflow/lite/tools/visualize_test.py
index 68de38cc9d7..4cbb01f2b58 100644
--- a/tensorflow/lite/tools/visualize_test.py
+++ b/tensorflow/lite/tools/visualize_test.py
@@ -16,8 +16,8 @@
 import os
 import re
 
-from tflite_micro.tensorflow.lite.tools import test_utils
-from tflite_micro.tensorflow.lite.tools import visualize
+from tflite_micro.tensorflow.lite.tools import test_utils
+from tflite_micro.tensorflow.lite.tools import visualize
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/lite/types/BUILD b/tensorflow/lite/types/BUILD
new file mode 100644
index 00000000000..d3894462cb0
--- /dev/null
+++ b/tensorflow/lite/types/BUILD
@@ -0,0 +1,11 @@
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
+cc_library(
+    name = "half",
+    hdrs = [
+        "half.h",
+    ],
+    visibility = [
+        "//tensorflow/lite:__subpackages__",
+    ],
+)
diff --git a/tensorflow/lite/types/half.h b/tensorflow/lite/types/half.h
new file mode 100644
index 00000000000..48b796a2107
--- /dev/null
+++ b/tensorflow/lite/types/half.h
@@ -0,0 +1,83 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_TYPES_HALF_H_
+#define TENSORFLOW_LITE_TYPES_HALF_H_
+
+#include <cstdint>
+
+namespace tflite {
+
+// Storage-only IEEE binary16 value: it carries the 16 raw bits and, in this
+class half {
+ private:
+  // We need this hoop jumping to enable implementing a constexpr `from_bits`.
+  struct zero_initializer {};
+  explicit constexpr half(zero_initializer) : bits_(0) {}
+
+ public:
+  half() = default;
+
+  // Disabled in TFLM to avoid dependencies on external fp16 conversion
+  // libraries. This is safe because TFLM does not currently support
+  // Float16 kernels, meaning reference kernel templates are never instantiated
+  // with `tflite::half`.
+  //
+  // If Float16 support is needed in the future, the build target must be
+  // updated to depend on a proper fp16 library and this code re-enabled.
+#if 0
+  half(float x) : bits_(fp16_ieee_from_fp32_value(x)) {}  // NOLINT
+  explicit half(int x)
+      : bits_(fp16_ieee_from_fp32_value(static_cast<float>(x))) {}
+
+  operator float() const { return fp16_ieee_to_fp32_value(bits_); }  // NOLINT
+#endif
+
+  static constexpr half from_bits(uint16_t bits) {
+    half result{zero_initializer{}};
+    result.bits_ = bits;
+    return result;
+  }
+
+  constexpr uint16_t to_bits() const { return bits_; }
+
+  // True for +0 (0x0000) and -0 (0x8000). Doubling shifts the sign bit out;
+  // the cast back to uint16_t discards it, so only the magnitude is tested.
+  constexpr bool is_zero() const {
+    return static_cast<uint16_t>(bits_ * 2) == 0;
+  }
+
+  static constexpr half epsilon() { return from_bits(0x1400); }  // 2^-10
+  static constexpr half infinity() { return from_bits(0x7c00); }
+  static constexpr half min() { return from_bits(0xfbff); }  // lowest finite
+  static constexpr half max() { return from_bits(0x7bff); }
+  static constexpr half smallest_normal() {
+    return from_bits(0x0400);  // 2^-14
+  }
+  // Neutral elements for min/max/sum reductions: +inf, -inf and +0.
+  static constexpr half min_identity() { return from_bits(0x7c00); }
+  static constexpr half max_identity() { return from_bits(0xfc00); }
+  static constexpr half sum_identity() { return from_bits(0); }
+
+  // Not private due to -Werror=class-memaccess, which can't be disabled:
+  // - via a --copt, because it seems to have no effect.
+  // - via .bazelrc, because it then applies to C code, and the compiler says
+  //   this flag is not valid in C.
+  uint16_t bits_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TYPES_HALF_H_

From 6ad2479cbc412c9fa8804d60bc16a8a92f16b342 Mon Sep 17 00:00:00 2001
From: Esun Kim <veblush@google.com>
Date: Thu, 18 Jun 2026 11:05:53 -0700
Subject: [PATCH 2/2] No abseil

---
 tensorflow/lite/kernels/kernel_util.cc | 45 +++++++++++++++++---------
 tensorflow/lite/kernels/kernel_util.h  |  8 ++---
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index d281bffd608..3575f845163 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #ifndef TF_LITE_STATIC_MEMORY
 #include <string>
 
-#include "absl/types/span.h"
 #include "tensorflow/lite/array.h"
 #endif  // TF_LITE_STATIC_MEMORY
 
@@ -35,9 +34,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 
-#ifndef TF_LITE_STATIC_MEMORY
-#include "tensorflow/lite/util.h"
-#endif
+
 
 #if defined(__APPLE__)
 #include "TargetConditionals.h"
@@ -598,27 +595,43 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor) {
   return false;
 }
 
-#ifndef TF_LITE_STATIC_MEMORY
 TfLiteStatus CheckedShapeProduct(TfLiteContext* context,
-                                 absl::Span<const int> dims,
+                                 std::initializer_list<int> dims,
                                  const char* error_message, size_t& product) {
-  // The CheckedNumElements function already checks for negative dimensions, so
-  // we don't do it here.
-  TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk,
-                     "%s", error_message);
+  const size_t max_count = std::numeric_limits<size_t>::max();
+  size_t checked_count = 1;
+  for (const int d : dims) {
+    TF_LITE_ENSURE_MSG(context, d >= 0, "%s", error_message);
+    const size_t extent = static_cast<size_t>(d);
+    // A zero running product can never overflow (and cannot be a divisor).
+    TF_LITE_ENSURE_MSG(
+        context, checked_count == 0 || extent <= max_count / checked_count,
+        "%s", error_message);
+    checked_count *= extent;
+  }
+  product = checked_count;
   return kTfLiteOk;
 }
 
 TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context,
-                                      absl::Span<const int> dims,
+                                      std::initializer_list<int> dims,
                                       const char* error_message, int& product) {
-  for (const int dim : dims) {
-    TF_LITE_ENSURE_MSG(context, dim >= 0, "Encountered a negative dimension.");
+  const size_t max_count = std::numeric_limits<size_t>::max();
+  size_t checked_count = 1;
+  for (const int d : dims) {
+    TF_LITE_ENSURE_MSG(context, d >= 0, "Encountered a negative dimension.");
+    const size_t extent = static_cast<size_t>(d);
+    // A zero running product can never overflow (and cannot be a divisor).
+    TF_LITE_ENSURE_MSG(
+        context, checked_count == 0 || extent <= max_count / checked_count,
+        "%s", error_message);
+    checked_count *= extent;
   }
-  TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk,
-                     "%s", error_message);
+  // The size_t product is exact at this point; it must also fit in an int.
+  const size_t max_int = static_cast<size_t>(std::numeric_limits<int>::max());
+  TF_LITE_ENSURE_MSG(context, checked_count <= max_int, "%s", error_message);
+  product = static_cast<int>(checked_count);
   return kTfLiteOk;
 }
-#endif
 
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 70436965710..5ef47194a70 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -18,10 +18,10 @@ limitations under the License.
 #include <stdint.h>
 
 #include <cstddef>
+#include <initializer_list>
 #include <limits>
 #ifndef TF_LITE_STATIC_MEMORY
 #include <string>
-#include "absl/types/span.h"
 #endif  // TF_LITE_STATIC_MEMORY
 
 #include "tensorflow/lite/core/c/builtin_op_data.h"
@@ -343,7 +343,6 @@ bool IsMobilePlatform();
 // Returns whether there is unspecified dimension in the tensor's dim signature.
 bool HasUnspecifiedDimension(const TfLiteTensor* tensor);
 
-#ifndef TF_LITE_STATIC_MEMORY
 /**
  * Calculates the product of the given dimensions. Returns an error if any of
  * the dimensions is negative or if the product overflows.
@@ -353,7 +352,7 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor);
  * @param product The output parameter to store the product.
  */
 TfLiteStatus CheckedShapeProduct(TfLiteContext* context,
-                                 absl::Span<const int> dims,
+                                 std::initializer_list<int> dims,
                                  const char* error_message, size_t& product);
 
 /**
@@ -365,9 +364,8 @@ TfLiteStatus CheckedShapeProduct(TfLiteContext* context,
  * @param product The output parameter to store the product.
  */
 TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context,
-                                      absl::Span<const int> dims,
+                                      std::initializer_list<int> dims,
                                       const char* error_message, int& product);
-#endif
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_