From 294b2776155013c1b4606a4d171b20dfe23fee6d Mon Sep 17 00:00:00 2001 From: Esun Kim Date: Thu, 18 Jun 2026 10:43:37 -0700 Subject: [PATCH 1/2] Synced --- ci/tflite_files.txt | 25 +-- tensorflow/lite/core/c/common.cc | 34 +++- tensorflow/lite/core/c/common.h | 41 ++++- tensorflow/lite/kernels/internal/BUILD | 1 + tensorflow/lite/kernels/internal/common.h | 20 +++ .../lite/kernels/internal/reference/add.h | 30 +++- .../kernels/internal/reference/batch_matmul.h | 162 +++++++++--------- .../internal/reference/binary_function.h | 43 +---- .../internal/reference/broadcast_loop.h | 132 ++++++++++++++ .../kernels/internal/reference/comparisons.cc | 41 ----- .../kernels/internal/reference/comparisons.h | 79 +++------ .../internal/reference/concatenation.h | 3 +- .../internal/reference/depthwiseconv_uint8.h | 6 +- .../lite/kernels/internal/reference/div.h | 109 ++++-------- .../internal/reference/integer_ops/add.h | 14 ++ .../internal/reference/maximum_minimum.h | 26 +-- .../lite/kernels/internal/reference/prelu.h | 64 +++---- .../lite/kernels/internal/reference/softmax.h | 25 +-- .../kernels/internal/reference/transpose.h | 3 + tensorflow/lite/kernels/internal/types.h | 44 ++++- tensorflow/lite/kernels/kernel_util.cc | 46 +++-- tensorflow/lite/kernels/kernel_util.h | 27 +++ .../lite/tools/flatbuffer_utils_test.py | 6 +- tensorflow/lite/tools/test_utils.py | 2 +- tensorflow/lite/tools/visualize_test.py | 4 +- tensorflow/lite/types/BUILD | 11 ++ tensorflow/lite/types/half.h | 83 +++++++++ 27 files changed, 680 insertions(+), 401 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/reference/broadcast_loop.h delete mode 100644 tensorflow/lite/kernels/internal/reference/comparisons.cc create mode 100644 tensorflow/lite/types/BUILD create mode 100644 tensorflow/lite/types/half.h diff --git a/ci/tflite_files.txt b/ci/tflite_files.txt index 351ece5cca4..ccbfd354215 100644 --- a/ci/tflite_files.txt +++ b/ci/tflite_files.txt @@ -30,16 +30,17 @@ tensorflow/lite/core/c/common.h tensorflow/lite/core/macros.h tensorflow/lite/kernels/internal/common.h tensorflow/lite/kernels/internal/compatibility.h -tensorflow/lite/kernels/internal/portable_tensor_utils.h tensorflow/lite/kernels/internal/portable_tensor_utils.cc +tensorflow/lite/kernels/internal/portable_tensor_utils.h tensorflow/lite/kernels/internal/quantization_util.h -tensorflow/lite/kernels/internal/reference/add.h tensorflow/lite/kernels/internal/reference/add_n.h +tensorflow/lite/kernels/internal/reference/add.h tensorflow/lite/kernels/internal/reference/arg_min_max.h -tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h tensorflow/lite/kernels/internal/reference/batch_matmul.h +tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h tensorflow/lite/kernels/internal/reference/binary_function.h tensorflow/lite/kernels/internal/reference/broadcast_args.h +tensorflow/lite/kernels/internal/reference/broadcast_loop.h tensorflow/lite/kernels/internal/reference/broadcast_to.h tensorflow/lite/kernels/internal/reference/ceil.h tensorflow/lite/kernels/internal/reference/comparisons.h @@ -54,17 +55,17 @@ tensorflow/lite/kernels/internal/reference/div.h tensorflow/lite/kernels/internal/reference/elu.h tensorflow/lite/kernels/internal/reference/exp.h tensorflow/lite/kernels/internal/reference/fill.h -tensorflow/lite/kernels/internal/reference/floor.h tensorflow/lite/kernels/internal/reference/floor_div.h tensorflow/lite/kernels/internal/reference/floor_mod.h +tensorflow/lite/kernels/internal/reference/floor.h tensorflow/lite/kernels/internal/reference/fully_connected.h tensorflow/lite/kernels/internal/reference/hard_swish.h tensorflow/lite/kernels/internal/reference/integer_ops/add.h tensorflow/lite/kernels/internal/reference/integer_ops/conv.h tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h -tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h +tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h tensorflow/lite/kernels/internal/reference/integer_ops/mean.h tensorflow/lite/kernels/internal/reference/integer_ops/mul.h tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h @@ -73,14 +74,16 @@ tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h tensorflow/lite/kernels/internal/reference/l2normalization.h tensorflow/lite/kernels/internal/reference/leaky_relu.h tensorflow/lite/kernels/internal/reference/log_softmax.h +tensorflow/lite/kernels/internal/reference/logistic.h +tensorflow/lite/kernels/internal/reference/lstm_cell.h tensorflow/lite/kernels/internal/reference/maximum_minimum.h tensorflow/lite/kernels/internal/reference/mul.h tensorflow/lite/kernels/internal/reference/neg.h tensorflow/lite/kernels/internal/reference/pad.h tensorflow/lite/kernels/internal/reference/pooling.h +tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h -tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h tensorflow/lite/kernels/internal/reference/prelu.h tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h tensorflow/lite/kernels/internal/reference/quantize.h @@ -90,18 +93,16 @@ tensorflow/lite/kernels/internal/reference/resize_bilinear.h tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h tensorflow/lite/kernels/internal/reference/reverse.h tensorflow/lite/kernels/internal/reference/round.h +tensorflow/lite/kernels/internal/reference/select.h +tensorflow/lite/kernels/internal/reference/slice.h tensorflow/lite/kernels/internal/reference/softmax.h tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h tensorflow/lite/kernels/internal/reference/space_to_depth.h -tensorflow/lite/kernels/internal/reference/sub.h -tensorflow/lite/kernels/internal/reference/logistic.h -tensorflow/lite/kernels/internal/reference/lstm_cell.h -tensorflow/lite/kernels/internal/reference/select.h -tensorflow/lite/kernels/internal/reference/slice.h tensorflow/lite/kernels/internal/reference/strided_slice.h +tensorflow/lite/kernels/internal/reference/sub.h tensorflow/lite/kernels/internal/reference/tanh.h -tensorflow/lite/kernels/internal/reference/transpose.h tensorflow/lite/kernels/internal/reference/transpose_conv.h +tensorflow/lite/kernels/internal/reference/transpose.h tensorflow/lite/kernels/internal/cppmath.h tensorflow/lite/kernels/internal/max.h tensorflow/lite/kernels/internal/min.h diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc index 6d247309ec5..37254cacc2f 100644 --- a/tensorflow/lite/core/c/common.cc +++ b/tensorflow/lite/core/c/common.cc @@ -20,6 +20,7 @@ limitations under the License. #endif // TF_LITE_STATIC_MEMORY #include +#include #include #include @@ -111,6 +112,7 @@ TfLiteSparsity TfLiteSparsityClone(const TfLiteSparsity& src) { if (src.dim_metadata) { dst.dim_metadata = reinterpret_cast( calloc(1, sizeof(TfLiteDimensionMetadata) * src.dim_metadata_size)); + if (src.dim_metadata_size > 0 && !dst.dim_metadata) return TfLiteSparsity(); for (int i = 0; i < src.dim_metadata_size; ++i) { dst.dim_metadata[i] = src.dim_metadata[i]; dst.dim_metadata[i].array_segments = @@ -129,6 +131,7 @@ TfLiteSparsity* TfLiteSparsityClone(const TfLiteSparsity* const src) { } TfLiteSparsity* dst = reinterpret_cast(calloc(1, sizeof(TfLiteSparsity))); + if (!dst) return nullptr; *dst = TfLiteSparsityClone(*src); return dst; } @@ -147,6 +150,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) { break; case kTfLiteAffineQuantization: { dst.params = calloc(1, sizeof(TfLiteAffineQuantization)); + if (!dst.params) return TfLiteQuantization(); const TfLiteAffineQuantization* const src_params = reinterpret_cast(src.params); TfLiteAffineQuantization* const dst_params = @@ -158,6 +162,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) { } case kTfLiteBlockwiseQuantization: { dst.params = calloc(1, sizeof(TfLiteBlockwiseQuantization)); + if (!dst.params) return TfLiteQuantization(); const TfLiteBlockwiseQuantization* const src_params = (TfLiteBlockwiseQuantization*)(src.params); TfLiteBlockwiseQuantization* const dst_params = @@ -219,6 +224,9 @@ TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src) { void TfLiteFloatArrayFree(TfLiteFloatArray* a) { TfLiteVarArrayFree(a); } void TfLiteTensorDataFree(TfLiteTensor* t) { + if (t == nullptr) { + return; + } if (t->allocation_type == kTfLiteVariantObject && t->data.data) { delete static_cast(t->data.data); } else if (t->allocation_type == kTfLiteDynamic || @@ -238,6 +246,9 @@ void TfLiteTensorDataFree(TfLiteTensor* t) { } void TfLiteQuantizationFree(TfLiteQuantization* quantization) { + if (quantization == nullptr) { + return; + } if (quantization->type == kTfLiteAffineQuantization) { TfLiteAffineQuantization* q_params = reinterpret_cast(quantization->params); @@ -294,6 +305,9 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { } void TfLiteTensorFree(TfLiteTensor* t) { + if (t == nullptr) { + return; + } TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); t->dims = nullptr; @@ -308,7 +322,7 @@ void TfLiteTensorFree(TfLiteTensor* t) { t->sparsity = nullptr; } -TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) { +TfLiteTensor TfLiteTensorClone(TfLiteTensor src) { // We copy all of the source data first, then we clone the fields that can't // be shared between two tensor instances. TfLiteTensor dst = src; @@ -335,16 +349,18 @@ TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) { break; case kTfLiteAllocationStrategyMalloc: dst.data.data = malloc(src.bytes); + if (src.bytes > 0 && !dst.data.data) return TfLiteTensor(); std::memcpy(dst.data.data, src.data.data, src.bytes); break; case kTfLiteAllocationStrategyNew: // Special case for variant objects. They are allocated using new/delete // but require using the `CloneTo` function. if (src.allocation_type == kTfLiteVariantObject) { - dst.data.data = reinterpret_cast(src.data.data) - ->CloneTo(nullptr); + dst.data.data = + static_cast(src.data.data)->CloneTo(nullptr); } else { - dst.data.data = new char[src.bytes]; + dst.data.data = new (std::nothrow) char[src.bytes]; + if (src.bytes > 0 && !dst.data.data) return TfLiteTensor(); std::memcpy(dst.data.data, src.data.data, src.bytes); } break; @@ -394,13 +410,21 @@ TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) { } auto* dst_vd = static_cast(dst->data.data); auto* src_vd = static_cast(src->data.data); + if (!src_vd) return kTfLiteError; // `CloneTo` will handle the case when `dst_vd` is nullptr, so it is safe // to `CloneTo` something which was "freed". Also, returning from `CloneTo` // will implicitly cast to `VariantData`; don't need static cast here. dst->data.data = src_vd->CloneTo(dst_vd); } else { - memcpy(dst->data.raw, src->data.raw, src->bytes); + if (dst->allocation_type == kTfLiteVariantObject) { + TfLiteTensorDataFree(dst); + dst->allocation_type = src->allocation_type; + } + if (src->bytes > 0) { + if (!dst->data.raw || !src->data.raw) return kTfLiteError; + memcpy(dst->data.raw, src->data.raw, src->bytes); + } } dst->buffer_handle = src->buffer_handle; dst->data_is_stale = src->data_is_stale; diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index a3b0dbd7492..8ea233516b3 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -56,6 +56,7 @@ limitations under the License. #include #include #include +#include #include "tensorflow/lite/core/c/c_api_types.h" // IWYU pragma: export @@ -277,13 +278,34 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); } \ } while (0) -#define TF_LITE_ENSURE_OK(context, status) \ - do { \ - const TfLiteStatus s = (status); \ - if ((s) != kTfLiteOk) { \ - return s; \ - } \ +#ifndef TF_LITE_STRIP_ERROR_STRINGS +#define TF_LITE_VAR_ARG_HEAD(FIRST, ...) FIRST +#define TF_LITE_STRINGIFY_HELPER(x) #x +#define TF_LITE_STRINGIFY(x) TF_LITE_STRINGIFY_HELPER(x) +// Checks that `status` evaluates to `kTfLiteOk`. +// +// Can take a printf style log message and its parameters after the status. The +// message will be printed using `TF_LITE_KERNEL_LOG` in case of error. +#define TF_LITE_ENSURE_OK(context, status, ...) \ + do { \ + const TfLiteStatus s = (status); \ + if (s != kTfLiteOk) { \ + if (sizeof(TF_LITE_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \ + TF_LITE_MAYBE_KERNEL_LOG((context), __FILE__ ":" TF_LITE_STRINGIFY( \ + __LINE__) ": " __VA_ARGS__); \ + } \ + return s; \ + } \ } while (0) +#else +#define TF_LITE_ENSURE_OK(context, status, ...) \ + do { \ + const TfLiteStatus s = (status); \ + if ((s) != kTfLiteOk) { \ + return s; \ + } \ + } while (0) +#endif // `std::unreachable` not available until CC23. #ifdef __GNUC__ // GCC, Clang, ICC @@ -1060,6 +1082,13 @@ typedef struct TfLiteContext { /// WARNING: This is an experimental interface that is subject to change. TfLiteStatus (*ReleaseSubgraphContext)(struct TfLiteContext* context, int subgraph_index); +#if defined(_WIN32) + /// Create a array of a given `size` (uninitialized entries). + TfLiteIntArray* (*TfLiteIntArrayCreate)(int size); // NOLINT + + /// Free memory of array `a`. + void (*TfLiteIntArrayFree)(TfLiteIntArray* a); // NOLINT +#endif // defined(_WIN32) } TfLiteContext; /// `TfLiteOperator` is an external version of `TfLiteRegistration` diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 79cb502eeb8..e18452a9340 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -146,6 +146,7 @@ cc_library( copts = tflite_copts(), deps = [ ":compatibility", + "//tensorflow/lite/types:half", ], ) diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 4d990d70aa0..929168b7098 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -78,7 +78,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (!broadcast_input1) { broadcast_input1 = true; broadcast_input2 = false; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input2_shape[num_compressed_dims - 1] *= input2_dim; compressed_output_shape[num_compressed_dims - 1] *= input2_dim; @@ -86,7 +90,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (!broadcast_input2) { broadcast_input1 = false; broadcast_input2 = true; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input1_shape[num_compressed_dims - 1] *= input1_dim; compressed_output_shape[num_compressed_dims - 1] *= input1_dim; @@ -95,7 +103,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (broadcast_input1 || broadcast_input2 || first_nonunit) { broadcast_input1 = false; broadcast_input2 = false; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input1_shape[num_compressed_dims - 1] *= input1_dim; compressed_input2_shape[num_compressed_dims - 1] *= input1_dim; @@ -105,7 +117,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, } if (num_input1_dims > num_input2_dims) { if (!broadcast_input2) { + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) { const size_t input1_dim = input1_dims[i]; @@ -117,7 +133,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, } } else if (num_input2_dims > num_input1_dims) { if (!broadcast_input1) { + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) { const size_t input2_dim = input2_dims[i]; diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h index 5b520bd1e13..d41299d16d3 100644 --- a/tensorflow/lite/kernels/internal/reference/add.h +++ b/tensorflow/lite/kernels/internal/reference/add.h @@ -23,6 +23,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { @@ -39,7 +40,7 @@ inline void Add(const ArithmeticParams& params, const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( + output_data[i] = ActivationFunctionWithMinMax( input1_data[i] + input2_data[i], activation_min, activation_max); } } @@ -328,6 +329,20 @@ BroadcastAdd6DSlow(const ArithmeticParams& params, constexpr int kMaxBroadcastDim = 6; T activation_min, activation_max; GetActivationParams(params, &activation_min, &activation_max); + const int broadcast_rank = std::max( + output_shape.DimensionsCount(), + std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount())); + if (broadcast_rank > kMaxBroadcastDim) { + ForEachBroadcastedElement( + input1_shape, input2_shape, output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = ActivationFunctionWithMinMax( + static_cast(input1_data[input1_index] + + input2_data[input2_index]), + activation_min, activation_max); + }); + return; + } // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -421,6 +436,19 @@ BroadcastAdd6DSlow(const ArithmeticParams& params, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { constexpr int kMaxBroadcastDim = 6; + const int broadcast_rank = std::max( + output_shape.DimensionsCount(), + std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount())); + if (broadcast_rank > kMaxBroadcastDim) { + ForEachBroadcastedElement( + input1_shape, input2_shape, output_shape, + [&](int output_index, int input1_index, int input2_index) { + AddElementwise(1, params, input1_data + input1_index, + input2_data + input2_index, + output_data + output_index); + }); + return; + } // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 71f456703a3..a0853526233 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" @@ -38,13 +39,13 @@ inline int broadcast_dim(int lhs_dim, int rhs_dim) { // Compute the "extent" for iterating on this dimension. // If we are broadcasting, then don't advance (i.e return 0). -inline int extent(const RuntimeShape& shape, int x) { +inline size_t extent(const RuntimeShape& shape, int x) { if (shape.Dims(x) == 1) { return 0; } - int prod = 1; + size_t prod = 1; for (int i = x + 1; i < shape.DimensionsCount(); ++i) { - prod *= shape.Dims(i); + prod *= static_cast(shape.Dims(i)); } return prod; } @@ -60,45 +61,45 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { - for (int i = 0; i < lhs_rows; ++i) { + for (size_t j = 0; j < rhs_cols; ++j) { + for (size_t i = 0; i < lhs_rows; ++i) { Tout total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { total += static_cast(lhs_ptr2[accum_depth * i + k]) * static_cast(rhs_ptr2[j * accum_depth + k]); } - int idx = lhs_rows * j + i; + size_t idx = lhs_rows * j + i; out_ptr[idx] = total; } } @@ -119,57 +120,62 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); - const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols; - const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols; - const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols; - const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows; - const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows; - const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows; + const size_t ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols; + const size_t ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols; + const size_t ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols; + const size_t woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows; + const size_t woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows; + const size_t woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows; if (!compute_row_sums || *compute_row_sums) { - int num_weights_matrices = 1; + size_t num_weights_matrices = 1; for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) { - num_weights_matrices *= extended_lhs_shape.Dims(i); + num_weights_matrices *= static_cast(extended_lhs_shape.Dims(i)); } + TFLITE_DCHECK_LE(num_weights_matrices * lhs_rows, + static_cast(std::numeric_limits::max())); + TFLITE_DCHECK_LE(accum_depth, + static_cast(std::numeric_limits::max())); tensor_utils::ReductionSumVector( - lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth); + lhs_data, row_sums, static_cast(num_weights_matrices * lhs_rows), + static_cast(accum_depth)); if (compute_row_sums) { *compute_row_sums = false; } } - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0); const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0); const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1); const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1); const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1); - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2); @@ -178,18 +184,18 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data, float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { + for (size_t j = 0; j < rhs_cols; ++j) { const float batch_scaling_factor = scale_ptr2[j]; const float batch_offset = static_cast(ioff_ptr2[j]); - for (int i = 0; i < lhs_rows; ++i) { + for (size_t i = 0; i < lhs_rows; ++i) { int32_t total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { total += lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k]; } int32_t row_sum = woff_ptr2[i]; total -= row_sum * batch_offset; - int idx = lhs_rows * j + i; + size_t idx = lhs_rows * j + i; float scale = batch_scaling_factor; if (per_channel_scales) { scale *= per_channel_scales[i]; @@ -214,24 +220,24 @@ inline void BatchMatMul(const FullyConnectedParams& params, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); const int32_t input_offset = params.input_offset; const int32_t filter_offset = params.weights_offset; @@ -242,23 +248,23 @@ inline void BatchMatMul(const FullyConnectedParams& params, const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const lhsT* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const rhsT* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const lhsT* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const rhsT* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const lhsT* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const rhsT* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; outputT* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { - for (int i = 0; i < lhs_rows; ++i) { + for (size_t j = 0; j < rhs_cols; ++j) { + for (size_t i = 0; i < lhs_rows; ++i) { AccumT total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { AccumT lhs_val = lhs_ptr2[accum_depth * i + k]; AccumT rhs_val = rhs_ptr2[accum_depth * j + k]; total += (lhs_val + filter_offset) * (rhs_val + input_offset); @@ -268,7 +274,7 @@ inline void BatchMatMul(const FullyConnectedParams& params, total_scaled += output_offset; total_scaled = std::max(total_scaled, output_activation_min); total_scaled = std::min(total_scaled, output_activation_max); - const int idx = lhs_rows * j + i; + const size_t idx = lhs_rows * j + i; out_ptr[idx] = static_cast(total_scaled); } } diff --git a/tensorflow/lite/kernels/internal/reference/binary_function.h b/tensorflow/lite/kernels/internal/reference/binary_function.h index 0b124af87f0..611fc3241b0 100644 --- a/tensorflow/lite/kernels/internal/reference/binary_function.h +++ b/tensorflow/lite/kernels/internal/reference/binary_function.h @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -32,42 +33,12 @@ inline void BroadcastBinaryFunction4DSlow( const RuntimeShape& unextended_input2_shape, const T2* input2_data, const RuntimeShape& unextended_output_shape, R* output_data, R (*func)(T1, T2)) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - - const int* dims_data = - reinterpret_cast(output_shape.DimsDataUpTo5D()); - for (int b = 0; b < output_shape.Dims(0); ++b) { - int out_idx_b = b * dims_data[1]; - int in_idx1_b = desc1.strides[0] * b; - int in_idx2_b = desc2.strides[0] * b; - for (int y = 0; y < output_shape.Dims(1); ++y) { - int out_idx_y = (out_idx_b + y) * dims_data[2]; - int in_idx1_y = in_idx1_b + desc1.strides[1] * y; - int in_idx2_y = in_idx2_b + desc2.strides[1] * y; - for (int x = 0; x < output_shape.Dims(2); ++x) { - int out_idx_x = (out_idx_y + x) * dims_data[3]; - int in1_idx = in_idx1_y + desc1.strides[2] * x; - int in2_idx = in_idx2_y + desc2.strides[2] * x; - for (int c = 0; c < output_shape.Dims(3); ++c) { - auto out_idx = out_idx_x + c; - auto in1_val = input1_data[in1_idx]; - auto in2_val = input2_data[in2_idx]; - output_data[out_idx] = func(in1_val, in2_val); - in1_idx += desc1.strides[3]; - in2_idx += desc2.strides[3]; - } - } - } - } + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = + func(input1_data[input1_index], input2_data[input2_index]); + }); } // R: Result type. T1: Input 1 type. T2: Input 2 type. diff --git a/tensorflow/lite/kernels/internal/reference/broadcast_loop.h b/tensorflow/lite/kernels/internal/reference/broadcast_loop.h new file mode 100644 index 00000000000..d52f2904637 --- /dev/null +++ b/tensorflow/lite/kernels/internal/reference/broadcast_loop.h @@ -0,0 +1,132 @@ +/* Copyright 2026 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_ + +#include +#include + +#include "tensorflow/lite/kernels/internal/runtime_shape.h" + +namespace tflite { +namespace reference_ops { + +inline std::vector BroadcastStridesForShape( + const RuntimeShape& unextended_shape, + const RuntimeShape& extended_output_shape) { + const int dims_count = extended_output_shape.DimensionsCount(); + const RuntimeShape extended_shape = + RuntimeShape::ExtendedShape(dims_count, unextended_shape); + std::vector strides(dims_count); + int stride = 1; + for (int i = dims_count - 1; i >= 0; --i) { + const int dim = extended_shape.Dims(i); + const int output_dim = extended_output_shape.Dims(i); + strides[i] = (dim == 1 && output_dim != 1) ? 0 : stride; + stride *= dim; + } + return strides; +} + +inline std::vector StridesForShape(const RuntimeShape& shape) { + const int dims_count = shape.DimensionsCount(); + std::vector strides(dims_count); + int stride = 1; + for (int i = dims_count - 1; i >= 0; --i) { + strides[i] = stride; + stride *= shape.Dims(i); + } + return strides; +} + +template +inline void ForEachBroadcastedElement(const RuntimeShape& input1_shape, + const RuntimeShape& input2_shape, + const RuntimeShape& output_shape, Fn fn) { + const int dims_count = std::max( + output_shape.DimensionsCount(), + std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount())); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(dims_count, output_shape); + const std::vector output_strides = + StridesForShape(extended_output_shape); + const std::vector input1_strides = + BroadcastStridesForShape(input1_shape, extended_output_shape); + const std::vector input2_strides = + BroadcastStridesForShape(input2_shape, extended_output_shape); + + const int flat_size = output_shape.FlatSize(); + for (int output_index = 0; output_index < flat_size; ++output_index) { + int remaining_index = output_index; + int input1_index = 0; + int input2_index = 0; + for (int dim = 0; dim < dims_count; ++dim) { + const int output_stride = output_strides[dim]; + const int coordinate = + output_stride == 0 ? 0 : remaining_index / output_stride; + if (output_stride != 0) { + remaining_index %= output_stride; + } + input1_index += coordinate * input1_strides[dim]; + input2_index += coordinate * input2_strides[dim]; + } + fn(output_index, input1_index, input2_index); + } +} + +template +inline void ForEachBroadcastedElement(const RuntimeShape& input1_shape, + const RuntimeShape& input2_shape, + const RuntimeShape& input3_shape, + const RuntimeShape& output_shape, Fn fn) { + const int dims_count = std::max( + std::max(output_shape.DimensionsCount(), input1_shape.DimensionsCount()), + std::max(input2_shape.DimensionsCount(), input3_shape.DimensionsCount())); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(dims_count, output_shape); + const std::vector output_strides = + StridesForShape(extended_output_shape); + const std::vector input1_strides = + BroadcastStridesForShape(input1_shape, extended_output_shape); + const std::vector input2_strides = + BroadcastStridesForShape(input2_shape, extended_output_shape); + const std::vector input3_strides = + BroadcastStridesForShape(input3_shape, extended_output_shape); + + const int flat_size = output_shape.FlatSize(); + for (int output_index = 0; output_index < flat_size; ++output_index) { + int remaining_index = output_index; + int input1_index = 0; + int input2_index = 0; + int input3_index = 0; + for (int dim = 0; dim < dims_count; ++dim) { + const int output_stride = output_strides[dim]; + const int coordinate = + output_stride == 0 ? 0 : remaining_index / output_stride; + if (output_stride != 0) { + remaining_index %= output_stride; + } + input1_index += coordinate * input1_strides[dim]; + input2_index += coordinate * input2_strides[dim]; + input3_index += coordinate * input3_strides[dim]; + } + fn(output_index, input1_index, input2_index, input3_index); + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_LOOP_H_ \ No newline at end of file diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.cc b/tensorflow/lite/kernels/internal/reference/comparisons.cc deleted file mode 100644 index 36ce951ec17..00000000000 --- a/tensorflow/lite/kernels/internal/reference/comparisons.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/kernels/internal/reference/comparisons.h" - -#include "tensorflow/lite/kernels/internal/common.h" -#include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/kernels/internal/runtime_shape.h" - -namespace tflite { -namespace reference_ops { - -BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( - const RuntimeShape& unextended_input1_shape, - const RuntimeShape& unextended_input2_shape, - const RuntimeShape& unextended_output_shape) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1, - desc2}; -} - -} // namespace reference_ops -} // namespace tflite diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.h b/tensorflow/lite/kernels/internal/reference/comparisons.h index e40e4045cc7..165110fe8eb 100644 --- a/tensorflow/lite/kernels/internal/reference/comparisons.h +++ b/tensorflow/lite/kernels/internal/reference/comparisons.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/core/macros.h" #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/runtime_shape.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -110,40 +111,18 @@ inline void ComparisonWithScaling( } } -struct BroadcastComparison4DSlowCommon { - const RuntimeShape output_shape; - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; -}; - -TFLITE_NOINLINE -BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( - const RuntimeShape& unextended_input1_shape, - const RuntimeShape& unextended_input2_shape, - const RuntimeShape& unextended_output_shape); - template F> inline void BroadcastComparison4DSlowImpl( const ComparisonParams& op_params, const RuntimeShape& unextended_input1_shape, const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, bool* output_data) { - const BroadcastComparison4DSlowCommon dims = - BroadcastComparison4DSlowPreprocess(unextended_input1_shape, - unextended_input2_shape, - unextended_output_shape); - - for (int b = 0; b < dims.output_shape.Dims(0); ++b) { - for (int y = 0; y < dims.output_shape.Dims(1); ++y) { - for (int x = 0; x < dims.output_shape.Dims(2); ++x) { - for (int c = 0; c < dims.output_shape.Dims(3); ++c) { - output_data[Offset(dims.output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)], - input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]); - } - } - } - } + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = + F(input1_data[input1_index], input2_data[input2_index]); + }); } template F> @@ -165,11 +144,6 @@ inline void BroadcastComparison4DSlowWithScaling( const RuntimeShape& unextended_input1_shape, const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, bool* output_data) { - const BroadcastComparison4DSlowCommon dims = - BroadcastComparison4DSlowPreprocess(unextended_input1_shape, - unextended_input2_shape, - unextended_output_shape); - int left_shift = op_params.left_shift; int32_t input1_offset = op_params.input1_offset; int32_t input1_multiplier = op_params.input1_multiplier; @@ -178,30 +152,21 @@ inline void BroadcastComparison4DSlowWithScaling( int32_t input2_multiplier = op_params.input2_multiplier; int input2_shift = op_params.input2_shift; - for (int b = 0; b < dims.output_shape.Dims(0); ++b) { - for (int y = 0; y < dims.output_shape.Dims(1); ++y) { - for (int x = 0; x < dims.output_shape.Dims(2); ++x) { - for (int c = 0; c < dims.output_shape.Dims(3); ++c) { - const int32_t input1_val = - input1_offset + - input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)]; - const int32_t input2_val = - input2_offset + - input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]; - const int32_t shifted_input1_val = input1_val * (1 << left_shift); - const int32_t shifted_input2_val = input2_val * (1 << left_shift); - const int32_t scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); - const int32_t scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); - output_data[Offset(dims.output_shape, b, y, x, c)] = - F(scaled_input1_val, scaled_input2_val); - } - } - } - } + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + const int32_t input1_val = input1_offset + input1_data[input1_index]; + const int32_t input2_val = input2_offset + input2_data[input2_index]; + const int32_t shifted_input1_val = input1_val * (1 << left_shift); + const int32_t shifted_input2_val = input2_val * (1 << left_shift); + const int32_t scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, input1_multiplier, input1_shift); + const int32_t scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, input2_multiplier, input2_shift); + output_data[output_index] = F(scaled_input1_val, scaled_input2_val); + }); } #define TFLITE_COMPARISON_OP(name) \ diff --git a/tensorflow/lite/kernels/internal/reference/concatenation.h b/tensorflow/lite/kernels/internal/reference/concatenation.h index 4a82d7c502d..915492b1e92 100644 --- a/tensorflow/lite/kernels/internal/reference/concatenation.h +++ b/tensorflow/lite/kernels/internal/reference/concatenation.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_ #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" @@ -109,7 +110,7 @@ inline void Concatenation(const ConcatenationParams& params, // not garbage. // Note: output_shape.FlatSize() gives number of elements (nibbles). // Bytes needed: (elements + 1) / 2. - memset(output_ptr, 0, (output_shape.FlatSize() + 1) / 2); + memset(output_ptr, 0, (static_cast(output_shape.FlatSize()) + 1) / 2); int64_t output_offset = 0; for (int k = 0; k < outer_size; k++) { diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h index d4fba1399fb..be9f5fcbe0c 100644 --- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h +++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h @@ -153,7 +153,8 @@ struct DepthwiseConvBasicKernel { for (int out_x = 0; out_x < output_width; ++out_x) { for (int ic = 0; ic < input_depth; ++ic) { for (int m = 0; m < depth_multiplier; m++) { - const int oc = m + ic * depth_multiplier; + const int64_t oc = + m + static_cast(ic) * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; @@ -240,7 +241,8 @@ struct DepthwiseConvBasicKernel { for (int out_x = 0; out_x < output_width; ++out_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { for (int m = 0; m < depth_multiplier; ++m) { - const int output_channel = m + in_channel * depth_multiplier; + const int64_t output_channel = + m + static_cast(in_channel) * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; diff --git a/tensorflow/lite/kernels/internal/reference/div.h b/tensorflow/lite/kernels/internal/reference/div.h index 5f26d3b8e6d..9ba3902271f 100644 --- a/tensorflow/lite/kernels/internal/reference/div.h +++ b/tensorflow/lite/kernels/internal/reference/div.h @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { @@ -117,50 +118,37 @@ inline void BroadcastDivSlowQuantized( const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, T* output_data) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - DivCheckArithmeticParams(params); - auto div_func = [&](int indexes[N]) { - int32_t input1_val = - params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)]; - int32_t input2_val = - params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)]; - TFLITE_DCHECK_NE(input2_val, 0); - if (input2_val < 0) { - // Invert signs to avoid a negative input2_val as input2_inv needs to be - // positive to be used as multiplier of MultiplyByQuantizedMultiplier. - input1_val = -input1_val; - input2_val = -input2_val; - } - int recip_shift; - const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift); - const int headroom = CountLeadingSignBits(input1_val); - const int32_t unscaled_quotient = - MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv, - headroom); - const int total_shift = params.output_shift - recip_shift - headroom; - const int32_t unclamped_result = - params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp( - unscaled_quotient, params.output_multiplier, total_shift); - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); - output_data[SubscriptToIndex(output_desc, indexes)] = - static_cast(clamped_output); - }; - NDOpsHelper(output_desc, div_func); + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + int32_t input1_val = params.input1_offset + input1_data[input1_index]; + int32_t input2_val = params.input2_offset + input2_data[input2_index]; + TFLITE_DCHECK_NE(input2_val, 0); + if (input2_val < 0) { + // Invert signs to avoid a negative input2_val as input2_inv needs to + // be positive to be used as multiplier of + // MultiplyByQuantizedMultiplier. + input1_val = -input1_val; + input2_val = -input2_val; + } + int recip_shift; + const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift); + const int headroom = CountLeadingSignBits(input1_val); + const int32_t unscaled_quotient = + MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv, + headroom); + const int total_shift = params.output_shift - recip_shift - headroom; + const int32_t unclamped_result = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp( + unscaled_quotient, params.output_multiplier, total_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + output_data[output_index] = static_cast(clamped_output); + }); } template @@ -202,10 +190,6 @@ inline void BroadcastDivSlow(const ArithmeticParams& params, input2_data, unextended_output_shape, output_data); } -// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. template void BroadcastDivSlow(const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape, @@ -218,34 +202,13 @@ void BroadcastDivSlow(const ArithmeticParams& params, T output_activation_max; GetActivationParams(params, &output_activation_min, &output_activation_max); - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - - auto div_func = [&](int indexes[N]) { - output_data[SubscriptToIndex(output_desc, indexes)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, indexes)] / - input2_data[SubscriptToIndex(desc2, indexes)], + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = ActivationFunctionWithMinMax( + input1_data[input1_index] / input2_data[input2_index], output_activation_min, output_activation_max); - }; - NDOpsHelper(output_desc, div_func); + }); } template diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index c2a0e0f082c..9f61a9196a5 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -132,6 +133,19 @@ void BroadcastBinaryFunction6DSlow( void (*check_arithmetic_params)(const ArithmeticParams&), T (*binary_func)(T, T, const ArithmeticParams&)) { constexpr int kMaxBroadcastDim = 6; + const int broadcast_rank = std::max( + output_shape.DimensionsCount(), + std::max(input1_shape.DimensionsCount(), input2_shape.DimensionsCount())); + if (broadcast_rank > kMaxBroadcastDim) { + check_arithmetic_params(params); + reference_ops::ForEachBroadcastedElement( + input1_shape, input2_shape, output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = binary_func( + input1_data[input1_index], input2_data[input2_index], params); + }); + return; + } // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the diff --git a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h index cd11b4191ac..805801df8a6 100644 --- a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h +++ b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_ #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -37,24 +38,13 @@ void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape, output_data[i] = op(input1_data[i], input2_data[i]); } } else { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast( - unextended_input1_shape, unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - - auto maxmin_func = [&](int indexes[N]) { - output_data[SubscriptToIndex(output_desc, indexes)] = - op(input1_data[SubscriptToIndex(desc1, indexes)], - input2_data[SubscriptToIndex(desc2, indexes)]); - }; - NDOpsHelper(output_desc, maxmin_func); + ForEachBroadcastedElement( + unextended_input1_shape, unextended_input2_shape, + unextended_output_shape, + [&](int output_index, int input1_index, int input2_index) { + output_data[output_index] = + op(input1_data[input1_index], input2_data[input2_index]); + }); } } diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index 1a5ef0cb1f4..400a1bac354 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -31,47 +32,30 @@ inline void BroadcastPrelu4DSlow( const PreluParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& alpha_shape, const U* alpha_data, const RuntimeShape& output_shape, T* output_data) { - TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2); - - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - int output_index = Offset(extended_output_shape, b, y, x, c); - int input_index = SubscriptToIndex(desc1, b, y, x, c); - const int32_t input_value = - params.input_offset + input_data[input_index]; - int32_t output_value; - if (input_value >= 0) { - output_value = MultiplyByQuantizedMultiplier( - input_value, params.output_multiplier_1, params.output_shift_1); - } else { - auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); - const int32_t alpha_value = - params.alpha_offset + alpha_data[alpha_index]; - - output_value = MultiplyByQuantizedMultiplier( - input_value * alpha_value, params.output_multiplier_2, - params.output_shift_2); - } - output_value += params.output_offset; - - const int32_t quantized_min = std::numeric_limits::min(); - const int32_t quantized_max = std::numeric_limits::max(); - const int32_t clamped_output = - std::min(quantized_max, std::max(quantized_min, output_value)); - output_data[output_index] = static_cast(clamped_output); + const int32_t quantized_min = std::numeric_limits::min(); + const int32_t quantized_max = std::numeric_limits::max(); + ForEachBroadcastedElement( + input_shape, alpha_shape, output_shape, + [&](int output_index, int input_index, int alpha_index) { + const int32_t input_value = + params.input_offset + input_data[input_index]; + int32_t output_value; + if (input_value >= 0) { + output_value = MultiplyByQuantizedMultiplier( + input_value, params.output_multiplier_1, params.output_shift_1); + } else { + const int32_t alpha_value = + params.alpha_offset + alpha_data[alpha_index]; + output_value = MultiplyByQuantizedMultiplier( + input_value * alpha_value, params.output_multiplier_2, + params.output_shift_2); } - } - } - } + output_value += params.output_offset; + + const int32_t clamped_output = + std::min(quantized_max, std::max(quantized_min, output_value)); + output_data[output_index] = static_cast(clamped_output); + }); } template diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h index 2930217b61f..27018436503 100644 --- a/tensorflow/lite/kernels/internal/reference/softmax.h +++ b/tensorflow/lite/kernels/internal/reference/softmax.h @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -28,9 +29,11 @@ limitations under the License. namespace tflite { namespace reference_ops { +template ::value, int>::type = 0> inline void Softmax(const SoftmaxParams& params, - const RuntimeShape& input_shape, const float* input_data, - const RuntimeShape& output_shape, float* output_data) { + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -38,26 +41,24 @@ inline void Softmax(const SoftmaxParams& params, MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { - // Find max element value which we'll use to ensure numerical stability - // taking advantage of the following equality: - // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) - float max = std::numeric_limits::lowest(); + T max = std::numeric_limits::lowest(); for (int c = 0; c < depth; ++c) { max = std::max(max, input_data[i * depth + c]); } - // Compute sum. float sum = 0.f; for (int c = 0; c < depth; ++c) { - const float exp_c = std::exp((input_data[i * depth + c] - max) * - static_cast(params.beta)); - output_data[i * depth + c] = exp_c; + const float exp_c = + std::exp((static_cast(input_data[i * depth + c]) - + static_cast(max)) * + static_cast(params.beta)); + output_data[i * depth + c] = static_cast(exp_c); sum += exp_c; } - // Compute result. for (int c = 0; c < depth; ++c) { - output_data[i * depth + c] = output_data[i * depth + c] / sum; + output_data[i * depth + c] = + static_cast(static_cast(output_data[i * depth + c]) / sum); } } } diff --git a/tensorflow/lite/kernels/internal/reference/transpose.h b/tensorflow/lite/kernels/internal/reference/transpose.h index 7e2bf7b266a..fbd991141a7 100644 --- a/tensorflow/lite/kernels/internal/reference/transpose.h +++ b/tensorflow/lite/kernels/internal/reference/transpose.h @@ -176,6 +176,9 @@ template void Transpose(const TransposeParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& output_shape, T* output_data) { + if (input_shape.FlatSize() == 0) { + return; + } using transpose_internal::SetupTransposeStrides; using transpose_internal::TransposeImpl; using transpose_internal::TransposeStorageType; diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h index 1cfc43d1662..c640e18abde 100644 --- a/tensorflow/lite/kernels/internal/types.h +++ b/tensorflow/lite/kernels/internal/types.h @@ -19,10 +19,40 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/runtime_shape.h" +#include "tensorflow/lite/types/half.h" + +namespace std { +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = + true; // NOLINT(readability-identifier-naming) + static constexpr tflite::half min() noexcept { + return tflite::half::smallest_normal(); + } + static constexpr tflite::half max() noexcept { return tflite::half::max(); } + static constexpr tflite::half lowest() noexcept { + return tflite::half::min(); + } + static constexpr tflite::half epsilon() noexcept { + return tflite::half::epsilon(); + } + static constexpr tflite::half quiet_NaN() noexcept { +#if TFLITE_ARCH_FLOAT16 + return tflite::half(__builtin_nanf("")); +#else + return tflite::half::from_bits(0x7e00); +#endif + } + static constexpr bool is_signed = + true; // NOLINT(readability-identifier-naming) +}; +} // namespace std namespace tflite { @@ -170,11 +200,13 @@ inline bool NextIndex(const int num_dims, const int* dims, IndexType* current) { } TFLITE_DCHECK(dims != nullptr); TFLITE_DCHECK(current != nullptr); + for (int i = 0; i < num_dims; ++i) { + TFLITE_DCHECK_GE(dims[i], 0); + } int carry = 1; for (int idx = num_dims - 1; idx >= 0; --idx) { IndexType current_val = current[idx] + carry; - TFLITE_DCHECK_GE(dims[idx], current_val); - if (dims[idx] == current_val) { + if (current_val >= dims[idx]) { current[idx] = 0; } else { current[idx] = current_val; @@ -999,7 +1031,7 @@ struct TanhParams { int input_left_shift; }; -constexpr int kTransposeMaxDimensions = 6; +constexpr int kTransposeMaxDimensions = 8; struct TransposeParams { int8_t perm_count; @@ -1075,6 +1107,12 @@ inline void GetActivationParams(const P& params, float* min, float* max) { *max = params.float_activation_max; } +template +inline void GetActivationParams(const P& params, half* min, half* max) { + *min = static_cast(params.float_activation_min); + *max = static_cast(params.float_activation_max); +} + template inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) { *min = params.int64_activation_min; diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc index 62feffc1c0a..d281bffd608 100644 --- a/tensorflow/lite/kernels/kernel_util.cc +++ b/tensorflow/lite/kernels/kernel_util.cc @@ -25,6 +25,7 @@ limitations under the License. #ifndef TF_LITE_STATIC_MEMORY #include +#include "absl/types/span.h" #include "tensorflow/lite/array.h" #endif // TF_LITE_STATIC_MEMORY @@ -34,6 +35,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#ifndef TF_LITE_STATIC_MEMORY +#include "tensorflow/lite/util.h" +#endif + #if defined(__APPLE__) #include "TargetConditionals.h" #endif @@ -101,9 +106,8 @@ inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context, const TfLiteNode* node, int index, const TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK( - context, ValidateTensorIndexingSafe(context, index, node->inputs->size, - node->inputs->data, &tensor_index)); + TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe( + context, index, node->inputs->size, node->inputs->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; } @@ -140,9 +144,8 @@ TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node, TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK( - context, ValidateTensorIndexingSafe(context, index, node->outputs->size, - node->outputs->data, &tensor_index)); + TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe( + context, index, node->outputs->size, node->outputs->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; } @@ -167,8 +170,8 @@ TfLiteStatus GetTemporarySafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe( - context, index, node->temporaries->size, + TF_LITE_ENSURE_STATUS( + ValidateTensorIndexingSafe(context, index, node->temporaries->size, node->temporaries->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; @@ -188,8 +191,8 @@ TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe( - context, index, node->intermediates->size, + TF_LITE_ENSURE_STATUS( + ValidateTensorIndexingSafe(context, index, node->intermediates->size, node->intermediates->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; @@ -595,4 +598,27 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor) { return false; } +#ifndef TF_LITE_STATIC_MEMORY +TfLiteStatus CheckedShapeProduct(TfLiteContext* context, + absl::Span dims, + const char* error_message, size_t& product) { + // The CheckedNumElements function already checks for negative dimensions, so + // we don't do it here. + TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, + "%s", error_message); + return kTfLiteOk; +} + +TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, + absl::Span dims, + const char* error_message, int& product) { + for (const int dim : dims) { + TF_LITE_ENSURE_MSG(context, dim >= 0, "Encountered a negative dimension."); + } + TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, + "%s", error_message); + return kTfLiteOk; +} +#endif + } // namespace tflite diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index 25e5386ccb6..70436965710 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -17,9 +17,11 @@ limitations under the License. #include +#include #include #ifndef TF_LITE_STATIC_MEMORY #include +#include "absl/types/span.h" #endif // TF_LITE_STATIC_MEMORY #include "tensorflow/lite/core/c/builtin_op_data.h" @@ -341,6 +343,31 @@ bool IsMobilePlatform(); // Returns whether there is unspecified dimension in the tensor's dim signature. bool HasUnspecifiedDimension(const TfLiteTensor* tensor); +#ifndef TF_LITE_STATIC_MEMORY +/** + * Calculates the product of the given dimensions. Returns an error if any of + * the dimensions is negative or if the product overflows. + * @param context The context to use for error reporting. + * @param dims The dimensions to multiply. + * @param error_message The error message to use if an error is encountered. + * @param product The output parameter to store the product. + */ +TfLiteStatus CheckedShapeProduct(TfLiteContext* context, + absl::Span dims, + const char* error_message, size_t& product); + +/** + * Calculates the product of the given dimensions. Returns an error if any of + * the dimensions is negative or if the product overflows. + * @param context The context to use for error reporting. + * @param dims The dimensions to multiply. + * @param error_message The error message to use if an error is encountered. + * @param product The output parameter to store the product. + */ +TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, + absl::Span dims, + const char* error_message, int& product); +#endif } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/tensorflow/lite/tools/flatbuffer_utils_test.py b/tensorflow/lite/tools/flatbuffer_utils_test.py index 13074aaca5e..e8a2e46b9be 100644 --- a/tensorflow/lite/tools/flatbuffer_utils_test.py +++ b/tensorflow/lite/tools/flatbuffer_utils_test.py @@ -18,9 +18,9 @@ import subprocess import sys -from tflite_micro.tensorflow.lite.python import schema_py_generated as schema # pylint:disable=g-direct-tensorflow-import -from tflite_micro.tensorflow.lite.tools import flatbuffer_utils -from tflite_micro.tensorflow.lite.tools import test_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.python import schema_py_generated as schema # pylint:disable=g-direct-tensorflow-import +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import flatbuffer_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import test_utils from tensorflow.python.framework import test_util from tensorflow.python.platform import test diff --git a/tensorflow/lite/tools/test_utils.py b/tensorflow/lite/tools/test_utils.py index 44157143d5d..582fbd2879b 100644 --- a/tensorflow/lite/tools/test_utils.py +++ b/tensorflow/lite/tools/test_utils.py @@ -18,7 +18,7 @@ """ import flatbuffers -from tflite_micro.tensorflow.lite.python import schema_py_generated as schema_fb +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.python import schema_py_generated as schema_fb TFLITE_SCHEMA_VERSION = 3 diff --git a/tensorflow/lite/tools/visualize_test.py b/tensorflow/lite/tools/visualize_test.py index 68de38cc9d7..4cbb01f2b58 100644 --- a/tensorflow/lite/tools/visualize_test.py +++ b/tensorflow/lite/tools/visualize_test.py @@ -16,8 +16,8 @@ import os import re -from tflite_micro.tensorflow.lite.tools import test_utils -from tflite_micro.tensorflow.lite.tools import visualize +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import test_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import visualize from tensorflow.python.framework import test_util from tensorflow.python.platform import test diff --git a/tensorflow/lite/types/BUILD b/tensorflow/lite/types/BUILD new file mode 100644 index 00000000000..d3894462cb0 --- /dev/null +++ b/tensorflow/lite/types/BUILD @@ -0,0 +1,11 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") + +cc_library( + name = "half", + hdrs = [ + "half.h", + ], + visibility = [ + "//tensorflow/lite:__subpackages__", + ], +) diff --git a/tensorflow/lite/types/half.h b/tensorflow/lite/types/half.h new file mode 100644 index 00000000000..48b796a2107 --- /dev/null +++ b/tensorflow/lite/types/half.h @@ -0,0 +1,83 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_TYPES_HALF_H_ +#define TENSORFLOW_LITE_TYPES_HALF_H_ + +#include + +namespace tflite { + +class half { + private: + // We need this hoop jumping to enable implementing a constexpr `from_bits`. + struct zero_initializer {}; + explicit constexpr half(zero_initializer) : bits_(0) {} + + public: + half() = default; + + // Disabled in TFLM to avoid dependencies on external fp16 conversion + // libraries. This is safe because TFLM does not currently support + // Float16 kernels, meaning reference kernel templates are never instantiated + // with `tflite::half`. + // + // If Float16 support is needed in the future, the build target must be + // updated to depend on a proper fp16 library and this code re-enabled. +#if 0 + half(float x) : bits_(fp16_ieee_from_fp32_value(x)) {} // NOLINT + explicit half(int x) + : bits_(fp16_ieee_from_fp32_value(static_cast(x))) {} + + operator float() const { return fp16_ieee_to_fp32_value(bits_); } // NOLINT +#endif + + static constexpr half from_bits(uint16_t bits) { + half result{zero_initializer{}}; + result.bits_ = bits; + return result; + } + + constexpr uint16_t to_bits() const { return bits_; } + + bool is_zero() const { + // Check for +/- zero (0x0000/0x8000). uint16 overflow is well defined to + // wrap around. + return static_cast(bits_ * 2) == 0; + } + + static constexpr half epsilon() { + return half::from_bits(0x1400); // 2^-10 = 0.0009765625 + } + static constexpr half infinity() { return from_bits(0x7c00); } + static constexpr half min() { return from_bits(0xfbff); } + static constexpr half max() { return from_bits(0x7bff); } + static constexpr half smallest_normal() { + return from_bits(0x0400); // 2^-14 + } + static constexpr half min_identity() { return from_bits(0x7c00); } + static constexpr half max_identity() { return from_bits(0xfc00); } + static constexpr half sum_identity() { return from_bits(0); } + + // Not private due to -Werror=class-memaccess, which can't be disabled: + // - via a --copt, because it seems to have no effect. + // - via .bazelrc, because it then applies to C code, and the compiler says + // this flag is not valid in C. + uint16_t bits_; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_TYPES_HALF_H_ From 6ad2479cbc412c9fa8804d60bc16a8a92f16b342 Mon Sep 17 00:00:00 2001 From: Esun Kim Date: Thu, 18 Jun 2026 11:05:53 -0700 Subject: [PATCH 2/2] No abseil --- tensorflow/lite/kernels/kernel_util.cc | 45 +++++++++++++++++--------- tensorflow/lite/kernels/kernel_util.h | 8 ++--- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc index d281bffd608..3575f845163 100644 --- a/tensorflow/lite/kernels/kernel_util.cc +++ b/tensorflow/lite/kernels/kernel_util.cc @@ -25,7 +25,6 @@ limitations under the License. #ifndef TF_LITE_STATIC_MEMORY #include -#include "absl/types/span.h" #include "tensorflow/lite/array.h" #endif // TF_LITE_STATIC_MEMORY @@ -35,9 +34,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" -#ifndef TF_LITE_STATIC_MEMORY -#include "tensorflow/lite/util.h" -#endif + #if defined(__APPLE__) #include "TargetConditionals.h" @@ -598,27 +595,43 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor) { return false; } -#ifndef TF_LITE_STATIC_MEMORY TfLiteStatus CheckedShapeProduct(TfLiteContext* context, - absl::Span dims, + std::initializer_list dims, const char* error_message, size_t& product) { - // The CheckedNumElements function already checks for negative dimensions, so - // we don't do it here. - TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, - "%s", error_message); + size_t checked_count = 1; + for (const int d : dims) { + if (d < 0) { + TF_LITE_ENSURE_MSG(context, false, "%s", error_message); + } + if (checked_count > 0 && + static_cast(d) > std::numeric_limits::max() / checked_count) { + TF_LITE_ENSURE_MSG(context, false, "%s", error_message); + } + checked_count *= d; + } + product = checked_count; return kTfLiteOk; } TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, - absl::Span dims, + std::initializer_list dims, const char* error_message, int& product) { - for (const int dim : dims) { - TF_LITE_ENSURE_MSG(context, dim >= 0, "Encountered a negative dimension."); + size_t checked_count = 1; + for (const int d : dims) { + if (d < 0) { + TF_LITE_ENSURE_MSG(context, false, "Encountered a negative dimension."); + } + if (checked_count > 0 && + static_cast(d) > std::numeric_limits::max() / checked_count) { + TF_LITE_ENSURE_MSG(context, false, "%s", error_message); + } + checked_count *= d; } - TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, - "%s", error_message); + if (checked_count > std::numeric_limits::max()) { + TF_LITE_ENSURE_MSG(context, false, "%s", error_message); + } + product = static_cast(checked_count); return kTfLiteOk; } -#endif } // namespace tflite diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index 70436965710..5ef47194a70 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -18,10 +18,10 @@ limitations under the License. #include #include +#include #include #ifndef TF_LITE_STATIC_MEMORY #include -#include "absl/types/span.h" #endif // TF_LITE_STATIC_MEMORY #include "tensorflow/lite/core/c/builtin_op_data.h" @@ -343,7 +343,6 @@ bool IsMobilePlatform(); // Returns whether there is unspecified dimension in the tensor's dim signature. bool HasUnspecifiedDimension(const TfLiteTensor* tensor); -#ifndef TF_LITE_STATIC_MEMORY /** * Calculates the product of the given dimensions. Returns an error if any of * the dimensions is negative or if the product overflows. @@ -353,7 +352,7 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor); * @param product The output parameter to store the product. */ TfLiteStatus CheckedShapeProduct(TfLiteContext* context, - absl::Span dims, + std::initializer_list dims, const char* error_message, size_t& product); /** @@ -365,9 +364,8 @@ TfLiteStatus CheckedShapeProduct(TfLiteContext* context, * @param product The output parameter to store the product. */ TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, - absl::Span dims, + std::initializer_list dims, const char* error_message, int& product); -#endif } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_