diff --git a/tensorflow/compiler/mlir/lite/core/c/tflite_types.h b/tensorflow/compiler/mlir/lite/core/c/tflite_types.h index c14cb5d2e81..3890f4e1c12 100644 --- a/tensorflow/compiler/mlir/lite/core/c/tflite_types.h +++ b/tensorflow/compiler/mlir/lite/core/c/tflite_types.h @@ -66,6 +66,8 @@ typedef enum { kTfLiteBFloat16 = 19, kTfLiteInt2 = 20, kTfLiteUInt4 = 21, + kTfLiteFloat8E4M3FN = 22, + kTfLiteFloat8E5M2 = 23, } TfLiteType; // LINT.ThenChange(//tensorflow/lite/profiling/proto/model_runtime_info.proto:EdgeDataType) diff --git a/tensorflow/compiler/mlir/lite/schema/schema.fbs b/tensorflow/compiler/mlir/lite/schema/schema.fbs index 76d92df99f9..217b6f4f201 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema.fbs +++ b/tensorflow/compiler/mlir/lite/schema/schema.fbs @@ -63,6 +63,8 @@ enum TensorType : byte { BFLOAT16 = 18, INT2 = 19, UINT4 = 20, + FLOAT8_E4M3FN = 21, + FLOAT8_E5M2 = 22, } // Custom quantization parameters for experimenting with new quantization diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 4fc99a07e5b..6790a92df44 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -1101,6 +1101,12 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, case TensorType_UINT4: *type = kTfLiteUInt4; return kTfLiteOk; + case TensorType_FLOAT8_E4M3FN: + *type = kTfLiteFloat8E4M3FN; + return kTfLiteOk; + case TensorType_FLOAT8_E5M2: + *type = kTfLiteFloat8E5M2; + return kTfLiteOk; default: *type = kTfLiteNoType; TF_LITE_REPORT_ERROR(error_reporter, diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc index 6d247309ec5..b842bca95f9 100644 --- a/tensorflow/lite/core/c/common.cc +++ b/tensorflow/lite/core/c/common.cc @@ -20,6 +20,7 @@ limitations under the License. #endif // TF_LITE_STATIC_MEMORY #include +#include #include #include @@ -111,6 +112,7 @@ TfLiteSparsity TfLiteSparsityClone(const TfLiteSparsity& src) { if (src.dim_metadata) { dst.dim_metadata = reinterpret_cast( calloc(1, sizeof(TfLiteDimensionMetadata) * src.dim_metadata_size)); + if (src.dim_metadata_size > 0 && !dst.dim_metadata) return TfLiteSparsity(); for (int i = 0; i < src.dim_metadata_size; ++i) { dst.dim_metadata[i] = src.dim_metadata[i]; dst.dim_metadata[i].array_segments = @@ -129,6 +131,7 @@ TfLiteSparsity* TfLiteSparsityClone(const TfLiteSparsity* const src) { } TfLiteSparsity* dst = reinterpret_cast(calloc(1, sizeof(TfLiteSparsity))); + if (!dst) return nullptr; *dst = TfLiteSparsityClone(*src); return dst; } @@ -147,6 +150,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) { break; case kTfLiteAffineQuantization: { dst.params = calloc(1, sizeof(TfLiteAffineQuantization)); + if (!dst.params) return TfLiteQuantization(); const TfLiteAffineQuantization* const src_params = reinterpret_cast(src.params); TfLiteAffineQuantization* const dst_params = @@ -158,6 +162,7 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) { } case kTfLiteBlockwiseQuantization: { dst.params = calloc(1, sizeof(TfLiteBlockwiseQuantization)); + if (!dst.params) return TfLiteQuantization(); const TfLiteBlockwiseQuantization* const src_params = (TfLiteBlockwiseQuantization*)(src.params); TfLiteBlockwiseQuantization* const dst_params = @@ -219,6 +224,9 @@ TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src) { void TfLiteFloatArrayFree(TfLiteFloatArray* a) { TfLiteVarArrayFree(a); } void TfLiteTensorDataFree(TfLiteTensor* t) { + if (t == nullptr) { + return; + } if (t->allocation_type == kTfLiteVariantObject && t->data.data) { delete static_cast(t->data.data); } else if (t->allocation_type == kTfLiteDynamic || @@ -238,6 +246,9 @@ void TfLiteTensorDataFree(TfLiteTensor* t) { } void TfLiteQuantizationFree(TfLiteQuantization* quantization) { + if (quantization == nullptr) { + return; + } if (quantization->type == kTfLiteAffineQuantization) { TfLiteAffineQuantization* q_params = reinterpret_cast(quantization->params); @@ -294,6 +305,9 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { } void TfLiteTensorFree(TfLiteTensor* t) { + if (t == nullptr) { + return; + } TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); t->dims = nullptr; @@ -308,7 +322,7 @@ void TfLiteTensorFree(TfLiteTensor* t) { t->sparsity = nullptr; } -TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) { +TfLiteTensor TfLiteTensorClone(TfLiteTensor src) { // We copy all of the source data first, then we clone the fields that can't // be shared between two tensor instances. TfLiteTensor dst = src; @@ -335,16 +349,18 @@ TfLiteTensor TfLiteTensorClone(const TfLiteTensor src) { break; case kTfLiteAllocationStrategyMalloc: dst.data.data = malloc(src.bytes); + if (src.bytes > 0 && !dst.data.data) return TfLiteTensor(); std::memcpy(dst.data.data, src.data.data, src.bytes); break; case kTfLiteAllocationStrategyNew: // Special case for variant objects. They are allocated using new/delete // but require using the `CloneTo` function. if (src.allocation_type == kTfLiteVariantObject) { - dst.data.data = reinterpret_cast(src.data.data) - ->CloneTo(nullptr); + dst.data.data = + static_cast(src.data.data)->CloneTo(nullptr); } else { - dst.data.data = new char[src.bytes]; + dst.data.data = new (std::nothrow) char[src.bytes]; + if (src.bytes > 0 && !dst.data.data) return TfLiteTensor(); std::memcpy(dst.data.data, src.data.data, src.bytes); } break; @@ -394,13 +410,21 @@ TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) { } auto* dst_vd = static_cast(dst->data.data); auto* src_vd = static_cast(src->data.data); + if (!src_vd) return kTfLiteError; // `CloneTo` will handle the case when `dst_vd` is nullptr, so it is safe // to `CloneTo` something which was "freed". Also, returning from `CloneTo` // will implicitly cast to `VariantData`; don't need static cast here. dst->data.data = src_vd->CloneTo(dst_vd); } else { - memcpy(dst->data.raw, src->data.raw, src->bytes); + if (dst->allocation_type == kTfLiteVariantObject) { + TfLiteTensorDataFree(dst); + dst->allocation_type = src->allocation_type; + } + if (src->bytes > 0) { + if (!dst->data.raw || !src->data.raw) return kTfLiteError; + memcpy(dst->data.raw, src->data.raw, src->bytes); + } } dst->buffer_handle = src->buffer_handle; dst->data_is_stale = src->data_is_stale; @@ -513,6 +537,10 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "INT2"; case kTfLiteUInt4: return "UINT4"; + case kTfLiteFloat8E4M3FN: + return "FLOAT8_E4M3FN"; + case kTfLiteFloat8E5M2: + return "FLOAT8_E5M2"; } return "Unknown type"; } diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index a3b0dbd7492..8ea233516b3 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -56,6 +56,7 @@ limitations under the License. #include #include #include +#include #include "tensorflow/lite/core/c/c_api_types.h" // IWYU pragma: export @@ -277,13 +278,34 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); } \ } while (0) -#define TF_LITE_ENSURE_OK(context, status) \ - do { \ - const TfLiteStatus s = (status); \ - if ((s) != kTfLiteOk) { \ - return s; \ - } \ +#ifndef TF_LITE_STRIP_ERROR_STRINGS +#define TF_LITE_VAR_ARG_HEAD(FIRST, ...) FIRST +#define TF_LITE_STRINGIFY_HELPER(x) #x +#define TF_LITE_STRINGIFY(x) TF_LITE_STRINGIFY_HELPER(x) +// Checks that `status` evaluates to `kTfLiteOk`. +// +// Can take a printf style log message and its parameters after the status. The +// message will be printed using `TF_LITE_KERNEL_LOG` in case of error. +#define TF_LITE_ENSURE_OK(context, status, ...) \ + do { \ + const TfLiteStatus s = (status); \ + if (s != kTfLiteOk) { \ + if (sizeof(TF_LITE_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \ + TF_LITE_MAYBE_KERNEL_LOG((context), __FILE__ ":" TF_LITE_STRINGIFY( \ + __LINE__) ": " __VA_ARGS__); \ + } \ + return s; \ + } \ } while (0) +#else +#define TF_LITE_ENSURE_OK(context, status, ...) \ + do { \ + const TfLiteStatus s = (status); \ + if ((s) != kTfLiteOk) { \ + return s; \ + } \ + } while (0) +#endif // `std::unreachable` not available until CC23. #ifdef __GNUC__ // GCC, Clang, ICC @@ -1060,6 +1082,13 @@ typedef struct TfLiteContext { /// WARNING: This is an experimental interface that is subject to change. TfLiteStatus (*ReleaseSubgraphContext)(struct TfLiteContext* context, int subgraph_index); +#if defined(_WIN32) + /// Create a array of a given `size` (uninitialized entries). + TfLiteIntArray* (*TfLiteIntArrayCreate)(int size); // NOLINT + + /// Free memory of array `a`. + void (*TfLiteIntArrayFree)(TfLiteIntArray* a); // NOLINT +#endif // defined(_WIN32) } TfLiteContext; /// `TfLiteOperator` is an external version of `TfLiteRegistration` diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 4d990d70aa0..929168b7098 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -78,7 +78,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (!broadcast_input1) { broadcast_input1 = true; broadcast_input2 = false; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input2_shape[num_compressed_dims - 1] *= input2_dim; compressed_output_shape[num_compressed_dims - 1] *= input2_dim; @@ -86,7 +90,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (!broadcast_input2) { broadcast_input1 = false; broadcast_input2 = true; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input1_shape[num_compressed_dims - 1] *= input1_dim; compressed_output_shape[num_compressed_dims - 1] *= input1_dim; @@ -95,7 +103,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, if (broadcast_input1 || broadcast_input2 || first_nonunit) { broadcast_input1 = false; broadcast_input2 = false; + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } compressed_input1_shape[num_compressed_dims - 1] *= input1_dim; compressed_input2_shape[num_compressed_dims - 1] *= input1_dim; @@ -105,7 +117,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, } if (num_input1_dims > num_input2_dims) { if (!broadcast_input2) { + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) { const size_t input1_dim = input1_dims[i]; @@ -117,7 +133,11 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape, } } else if (num_input2_dims > num_input1_dims) { if (!broadcast_input1) { + if (num_compressed_dims >= MAX_DIM) return false; num_compressed_dims++; + if (num_compressed_dims > MAX_DIM) { + return false; + } } for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) { const size_t input2_dim = input2_dims[i]; diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h index 5b520bd1e13..395a623a028 100644 --- a/tensorflow/lite/kernels/internal/reference/add.h +++ b/tensorflow/lite/kernels/internal/reference/add.h @@ -23,6 +23,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { @@ -39,7 +40,7 @@ inline void Add(const ArithmeticParams& params, const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( + output_data[i] = ActivationFunctionWithMinMax( input1_data[i] + input2_data[i], activation_min, activation_max); } } @@ -266,57 +267,6 @@ inline void AddElementwise(const int32_t* input1_data, } } -template -inline void BroadcastAddRecursiveDimensions( - int dimension, size_t* input1_offset_p, size_t* input2_offset_p, - size_t* output_offset, size_t* compressed_input1_stride, - size_t* compressed_input2_stride, size_t* compressed_output_shape, - T activation_min, T activation_max, const T* input1_data, - const T* input2_data, T* output_data) { - if (dimension > 0) { - for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) { - size_t input1_offset_c = *input1_offset_p; - size_t input2_offset_c = *input2_offset_p; - BroadcastAddRecursiveDimensions( - dimension - 1, &input1_offset_c, &input2_offset_c, output_offset, - compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, activation_min, activation_max, input1_data, - input2_data, output_data); - *input1_offset_p += compressed_input1_stride[dimension]; - *input2_offset_p += compressed_input2_stride[dimension]; - } - } else { - TFLITE_DCHECK(dimension == 0); - bool input1_is_broadcast = compressed_input1_stride[dimension] == 0; - bool input2_is_broadcast = compressed_input2_stride[dimension] == 0; - TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast)); - const T* input1_data_ptr = input1_data + *input1_offset_p; - const T* input2_data_ptr = input2_data + *input2_offset_p; - T* output_data_ptr = output_data + *output_offset; - if (input1_is_broadcast) { - // input1 is broadcast. - AddBroadcast(input2_data_ptr, input1_data_ptr, output_data_ptr, - compressed_output_shape[dimension], activation_min, - activation_max); - *input2_offset_p += compressed_output_shape[dimension]; - } else if (input2_is_broadcast) { - // input2 is broadcast. - AddBroadcast(input1_data_ptr, input2_data_ptr, output_data_ptr, - compressed_output_shape[dimension], activation_min, - activation_max); - *input1_offset_p += compressed_output_shape[dimension]; - } else { - // Add element-wise. - AddElementwise(input1_data_ptr, input2_data_ptr, output_data_ptr, - compressed_output_shape[dimension], activation_min, - activation_max); - *input1_offset_p += compressed_output_shape[dimension]; - *input2_offset_p += compressed_output_shape[dimension]; - } - *output_offset += compressed_output_shape[dimension]; - } -} - template @@ -325,90 +275,14 @@ BroadcastAdd6DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { - constexpr int kMaxBroadcastDim = 6; T activation_min, activation_max; GetActivationParams(params, &activation_min, &activation_max); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - size_t compressed_input1_stride[kMaxBroadcastDim]; - size_t compressed_input2_stride[kMaxBroadcastDim]; - size_t compressed_output_shape[kMaxBroadcastDim]; - bool broadcastable_shape = ReduceDimensionsForBroadcast( - input1_shape, input2_shape, compressed_input1_stride, - compressed_input2_stride, compressed_output_shape); - // Skip broadcasting for degenerate shapes. - if (!broadcastable_shape) { - return; - } - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastAddRecursiveDimensions( - kMaxBroadcastDim - 1, &input1_offset, &input2_offset, &output_offset, - compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, activation_min, activation_max, input1_data, - input2_data, output_data); -} - -// This function is used for 8-bit as well as for 16-bit, but the accumulator -// is 32-bit for both cases. The overflow does not happen due to the -// choice of the shift (20 or 15, accordingly - see add.cc for more comments). -template -inline void BroadcastAddRecursiveDimensions( - const ArithmeticParams& params, int dimension, size_t* input1_offset_p, - size_t* input2_offset_p, size_t* output_offset, - size_t* compressed_input1_stride, size_t* compressed_input2_stride, - size_t* compressed_output_shape, const T* input1_data, const T* input2_data, - T* output_data) { - for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) { - if (dimension > 0) { - size_t input1_offset_c = *input1_offset_p; - size_t input2_offset_c = *input2_offset_p; - BroadcastAddRecursiveDimensions( - params, dimension - 1, &input1_offset_c, &input2_offset_c, - output_offset, compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, input1_data, input2_data, output_data); - } else { - TFLITE_DCHECK(dimension == 0); - const int32_t input1_val = - params.input1_offset + input1_data[*input1_offset_p]; - const int32_t input2_val = - params.input2_offset + input2_data[*input2_offset_p]; - const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); - const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); - const int32_t scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, - params.input1_shift); - const int32_t scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, - params.input2_shift); - const int32_t raw_sum = scaled_input1_val + scaled_input2_val; - const int32_t raw_output = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + - params.output_offset; - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, raw_output)); - output_data[*output_offset] = static_cast(clamped_output); - ++(*output_offset); - } - *input1_offset_p += compressed_input1_stride[dimension]; - *input2_offset_p += compressed_input2_stride[dimension]; - } + auto op = [activation_min, activation_max](T a, T b) { + return ActivationFunctionWithMinMax(a + b, activation_min, + activation_max); + }; + BroadcastBinaryOpSimple(input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, op); } // This function is used for 8-bit as well as for 16-bit, but the accumulator @@ -420,37 +294,29 @@ BroadcastAdd6DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { - constexpr int kMaxBroadcastDim = 6; - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - size_t compressed_input1_stride[kMaxBroadcastDim]; - size_t compressed_input2_stride[kMaxBroadcastDim]; - size_t compressed_output_shape[kMaxBroadcastDim]; - bool broadcastable_shape = ReduceDimensionsForBroadcast( - input1_shape, input2_shape, compressed_input1_stride, - compressed_input2_stride, compressed_output_shape); - // Skip broadcasting for degenerate shapes. - if (!broadcastable_shape) { - return; - } - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastAddRecursiveDimensions( - params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset, - &output_offset, compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, input1_data, input2_data, output_data); + auto op = [¶ms](T a, T b) { + const int32_t input1_val = params.input1_offset + a; + const int32_t input2_val = params.input2_offset + b; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + return static_cast(clamped_output); + }; + BroadcastBinaryOpSimple(input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 71f456703a3..a0853526233 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" @@ -38,13 +39,13 @@ inline int broadcast_dim(int lhs_dim, int rhs_dim) { // Compute the "extent" for iterating on this dimension. // If we are broadcasting, then don't advance (i.e return 0). -inline int extent(const RuntimeShape& shape, int x) { +inline size_t extent(const RuntimeShape& shape, int x) { if (shape.Dims(x) == 1) { return 0; } - int prod = 1; + size_t prod = 1; for (int i = x + 1; i < shape.DimensionsCount(); ++i) { - prod *= shape.Dims(i); + prod *= static_cast(shape.Dims(i)); } return prod; } @@ -60,45 +61,45 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { - for (int i = 0; i < lhs_rows; ++i) { + for (size_t j = 0; j < rhs_cols; ++j) { + for (size_t i = 0; i < lhs_rows; ++i) { Tout total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { total += static_cast(lhs_ptr2[accum_depth * i + k]) * static_cast(rhs_ptr2[j * accum_depth + k]); } - int idx = lhs_rows * j + i; + size_t idx = lhs_rows * j + i; out_ptr[idx] = total; } } @@ -119,57 +120,62 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); - const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols; - const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols; - const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols; - const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows; - const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows; - const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows; + const size_t ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols; + const size_t ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols; + const size_t ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols; + const size_t woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows; + const size_t woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows; + const size_t woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows; if (!compute_row_sums || *compute_row_sums) { - int num_weights_matrices = 1; + size_t num_weights_matrices = 1; for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) { - num_weights_matrices *= extended_lhs_shape.Dims(i); + num_weights_matrices *= static_cast(extended_lhs_shape.Dims(i)); } + TFLITE_DCHECK_LE(num_weights_matrices * lhs_rows, + static_cast(std::numeric_limits::max())); + TFLITE_DCHECK_LE(accum_depth, + static_cast(std::numeric_limits::max())); tensor_utils::ReductionSumVector( - lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth); + lhs_data, row_sums, static_cast(num_weights_matrices * lhs_rows), + static_cast(accum_depth)); if (compute_row_sums) { *compute_row_sums = false; } } - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0); const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0); const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1); const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1); const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1); - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2); @@ -178,18 +184,18 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data, float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { + for (size_t j = 0; j < rhs_cols; ++j) { const float batch_scaling_factor = scale_ptr2[j]; const float batch_offset = static_cast(ioff_ptr2[j]); - for (int i = 0; i < lhs_rows; ++i) { + for (size_t i = 0; i < lhs_rows; ++i) { int32_t total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { total += lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k]; } int32_t row_sum = woff_ptr2[i]; total -= row_sum * batch_offset; - int idx = lhs_rows * j + i; + size_t idx = lhs_rows * j + i; float scale = batch_scaling_factor; if (per_channel_scales) { scale *= per_channel_scales[i]; @@ -214,24 +220,24 @@ inline void BatchMatMul(const FullyConnectedParams& params, const RuntimeShape extended_rhs_shape = RuntimeShape::ExtendedShape(5, rhs_shape); - const int batch_dim0 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); - const int batch_dim1 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); - const int batch_dim2 = batch_matmul::broadcast_dim( - extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + const size_t batch_dim0 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0))); + const size_t batch_dim1 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1))); + const size_t batch_dim2 = static_cast(batch_matmul::broadcast_dim( + extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2))); - const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); - const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); - const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); - const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); - const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); - const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); + const size_t lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0); + const size_t lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1); + const size_t lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2); + const size_t rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0); + const size_t rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1); + const size_t rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2); // Set params for each matrix multiply. - const int lhs_rows = extended_lhs_shape.Dims(3); - const int rhs_cols = extended_rhs_shape.Dims(4); - const int accum_depth = extended_lhs_shape.Dims(4); + const size_t lhs_rows = static_cast(extended_lhs_shape.Dims(3)); + const size_t rhs_cols = static_cast(extended_rhs_shape.Dims(4)); + const size_t accum_depth = static_cast(extended_lhs_shape.Dims(4)); const int32_t input_offset = params.input_offset; const int32_t filter_offset = params.weights_offset; @@ -242,23 +248,23 @@ inline void BatchMatMul(const FullyConnectedParams& params, const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - for (int b0 = 0; b0 < batch_dim0; ++b0) { + for (size_t b0 = 0; b0 < batch_dim0; ++b0) { const lhsT* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); const rhsT* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); - for (int b1 = 0; b1 < batch_dim1; ++b1) { + for (size_t b1 = 0; b1 < batch_dim1; ++b1) { const lhsT* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; const rhsT* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; - for (int b2 = 0; b2 < batch_dim2; ++b2) { + for (size_t b2 = 0; b2 < batch_dim2; ++b2) { const lhsT* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const rhsT* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; outputT* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; - for (int j = 0; j < rhs_cols; ++j) { - for (int i = 0; i < lhs_rows; ++i) { + for (size_t j = 0; j < rhs_cols; ++j) { + for (size_t i = 0; i < lhs_rows; ++i) { AccumT total = 0; - for (int k = 0; k < accum_depth; ++k) { + for (size_t k = 0; k < accum_depth; ++k) { AccumT lhs_val = lhs_ptr2[accum_depth * i + k]; AccumT rhs_val = rhs_ptr2[accum_depth * j + k]; total += (lhs_val + filter_offset) * (rhs_val + input_offset); @@ -268,7 +274,7 @@ inline void BatchMatMul(const FullyConnectedParams& params, total_scaled += output_offset; total_scaled = std::max(total_scaled, output_activation_min); total_scaled = std::min(total_scaled, output_activation_max); - const int idx = lhs_rows * j + i; + const size_t idx = lhs_rows * j + i; out_ptr[idx] = static_cast(total_scaled); } } diff --git a/tensorflow/lite/kernels/internal/reference/binary_function.h b/tensorflow/lite/kernels/internal/reference/binary_function.h index 0b124af87f0..7611dd27ec5 100644 --- a/tensorflow/lite/kernels/internal/reference/binary_function.h +++ b/tensorflow/lite/kernels/internal/reference/binary_function.h @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -32,42 +33,10 @@ inline void BroadcastBinaryFunction4DSlow( const RuntimeShape& unextended_input2_shape, const T2* input2_data, const RuntimeShape& unextended_output_shape, R* output_data, R (*func)(T1, T2)) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - - const int* dims_data = - reinterpret_cast(output_shape.DimsDataUpTo5D()); - for (int b = 0; b < output_shape.Dims(0); ++b) { - int out_idx_b = b * dims_data[1]; - int in_idx1_b = desc1.strides[0] * b; - int in_idx2_b = desc2.strides[0] * b; - for (int y = 0; y < output_shape.Dims(1); ++y) { - int out_idx_y = (out_idx_b + y) * dims_data[2]; - int in_idx1_y = in_idx1_b + desc1.strides[1] * y; - int in_idx2_y = in_idx2_b + desc2.strides[1] * y; - for (int x = 0; x < output_shape.Dims(2); ++x) { - int out_idx_x = (out_idx_y + x) * dims_data[3]; - int in1_idx = in_idx1_y + desc1.strides[2] * x; - int in2_idx = in_idx2_y + desc2.strides[2] * x; - for (int c = 0; c < output_shape.Dims(3); ++c) { - auto out_idx = out_idx_x + c; - auto in1_val = input1_data[in1_idx]; - auto in2_val = input2_data[in2_idx]; - output_data[out_idx] = func(in1_val, in2_val); - in1_idx += desc1.strides[3]; - in2_idx += desc2.strides[3]; - } - } - } - } + auto op = [func](T1 a, T2 b) { return func(a, b); }; + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } // R: Result type. T1: Input 1 type. T2: Input 2 type. diff --git a/tensorflow/lite/kernels/internal/reference/broadcast_to.h b/tensorflow/lite/kernels/internal/reference/broadcast_to.h index 66d86af4a50..3eb3fbb30f8 100644 --- a/tensorflow/lite/kernels/internal/reference/broadcast_to.h +++ b/tensorflow/lite/kernels/internal/reference/broadcast_to.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -100,6 +101,47 @@ inline void BroadcastTo(const RuntimeShape& unextended_input_shape, BroadcastImpl(input_desc, input_data, output_desc, output_data, indexes, 0, last_broadcast_dim, TfLiteTypeGetSize(data_type)); } + +inline void BroadcastTo(const RuntimeShape& unextended_input_shape, + const char* input_data, + const RuntimeShape& unextended_output_shape, + char* output_data, TfLiteType data_type) { + const int dims = unextended_output_shape.DimensionsCount(); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(dims, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(dims, unextended_output_shape); + const int type_size = TfLiteTypeGetSize(data_type); + if (dims == 0) { + memcpy(output_data, input_data, type_size); + return; + } + + std::vector input_strides(dims); + std::vector output_strides(dims); + input_strides[dims - 1] = 1; + output_strides[dims - 1] = 1; + for (int i = dims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_shape.Dims(i + 1); + output_strides[i] = output_strides[i + 1] * output_shape.Dims(i + 1); + } + + const int output_flat_size = unextended_output_shape.FlatSize(); + for (int output_index = 0; output_index < output_flat_size; ++output_index) { + int remaining_index = output_index; + int input_index = 0; + for (int dim = 0; dim < dims; ++dim) { + const int coordinate = remaining_index / output_strides[dim]; + remaining_index %= output_strides[dim]; + if (input_shape.Dims(dim) != 1) { + input_index += coordinate * input_strides[dim]; + } + } + memcpy(output_data + static_cast(output_index) * type_size, + input_data + static_cast(input_index) * type_size, + type_size); + } +} } // namespace reference_ops } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.h b/tensorflow/lite/kernels/internal/reference/comparisons.h index e40e4045cc7..4c1165a7dd9 100644 --- a/tensorflow/lite/kernels/internal/reference/comparisons.h +++ b/tensorflow/lite/kernels/internal/reference/comparisons.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/core/macros.h" #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/runtime_shape.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -110,40 +111,15 @@ inline void ComparisonWithScaling( } } -struct BroadcastComparison4DSlowCommon { - const RuntimeShape output_shape; - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; -}; - -TFLITE_NOINLINE -BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( - const RuntimeShape& unextended_input1_shape, - const RuntimeShape& unextended_input2_shape, - const RuntimeShape& unextended_output_shape); - template F> inline void BroadcastComparison4DSlowImpl( const ComparisonParams& op_params, const RuntimeShape& unextended_input1_shape, const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, bool* output_data) { - const BroadcastComparison4DSlowCommon dims = - BroadcastComparison4DSlowPreprocess(unextended_input1_shape, - unextended_input2_shape, - unextended_output_shape); - - for (int b = 0; b < dims.output_shape.Dims(0); ++b) { - for (int y = 0; y < dims.output_shape.Dims(1); ++y) { - for (int x = 0; x < dims.output_shape.Dims(2); ++x) { - for (int c = 0; c < dims.output_shape.Dims(3); ++c) { - output_data[Offset(dims.output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)], - input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]); - } - } - } - } + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, F); } template F> @@ -165,11 +141,6 @@ inline void BroadcastComparison4DSlowWithScaling( const RuntimeShape& unextended_input1_shape, const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, bool* output_data) { - const BroadcastComparison4DSlowCommon dims = - BroadcastComparison4DSlowPreprocess(unextended_input1_shape, - unextended_input2_shape, - unextended_output_shape); - int left_shift = op_params.left_shift; int32_t input1_offset = op_params.input1_offset; int32_t input1_multiplier = op_params.input1_multiplier; @@ -178,30 +149,23 @@ inline void BroadcastComparison4DSlowWithScaling( int32_t input2_multiplier = op_params.input2_multiplier; int input2_shift = op_params.input2_shift; - for (int b = 0; b < dims.output_shape.Dims(0); ++b) { - for (int y = 0; y < dims.output_shape.Dims(1); ++y) { - for (int x = 0; x < dims.output_shape.Dims(2); ++x) { - for (int c = 0; c < dims.output_shape.Dims(3); ++c) { - const int32_t input1_val = - input1_offset + - input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)]; - const int32_t input2_val = - input2_offset + - input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]; - const int32_t shifted_input1_val = input1_val * (1 << left_shift); - const int32_t shifted_input2_val = input2_val * (1 << left_shift); - const int32_t scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); - const int32_t scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); - output_data[Offset(dims.output_shape, b, y, x, c)] = - F(scaled_input1_val, scaled_input2_val); - } - } - } - } + auto op = [=](T a, T b) { + const int32_t input1_val = input1_offset + a; + const int32_t input2_val = input2_offset + b; + const int32_t shifted_input1_val = input1_val * (1 << left_shift); + const int32_t shifted_input2_val = input2_val * (1 << left_shift); + const int32_t scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, input1_multiplier, input1_shift); + const int32_t scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, input2_multiplier, input2_shift); + return F(scaled_input1_val, scaled_input2_val); + }; + + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } #define TFLITE_COMPARISON_OP(name) \ diff --git a/tensorflow/lite/kernels/internal/reference/concatenation.h b/tensorflow/lite/kernels/internal/reference/concatenation.h index 4a82d7c502d..bbf2e39dcc9 100644 --- a/tensorflow/lite/kernels/internal/reference/concatenation.h +++ b/tensorflow/lite/kernels/internal/reference/concatenation.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_ #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" @@ -60,9 +61,9 @@ inline void Concatenation(const ConcatenationParams& params, } Scalar* output_ptr = output_data; - for (int k = 0; k < outer_size; k++) { + for (int64_t k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { - const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; + const int64_t copy_size = input_shapes[i]->Dims(axis) * base_inner_size; const Scalar* input_ptr = input_data[i] + k * copy_size; memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar)); output_ptr += copy_size; @@ -107,12 +108,16 @@ inline void Concatenation(const ConcatenationParams& params, // We can't guarantee that the output buffer is initialized to 0, so we have // to clear it to ensure the high/low nibbles not currently being written are // not garbage. - // Note: output_shape.FlatSize() gives number of elements (nibbles). + // Note: The total number of elements (nibbles) is outer_size * + // output_shape.Dims(axis) * base_inner_size. We use int64_t to avoid + // overflow issues with FlatSize(). + int64_t total_elements = + outer_size * output_shape.Dims(axis) * base_inner_size; // Bytes needed: (elements + 1) / 2. - memset(output_ptr, 0, (output_shape.FlatSize() + 1) / 2); + memset(output_ptr, 0, (static_cast(total_elements) + 1) / 2); int64_t output_offset = 0; - for (int k = 0; k < outer_size; k++) { + for (int64_t k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { const int64_t copy_size = input_shapes[i]->Dims(axis) * base_inner_size; const uint8_t* input_ptr = @@ -124,7 +129,7 @@ inline void Concatenation(const ConcatenationParams& params, // So current offset in elements is k * copy_size. int64_t input_offset = k * copy_size; - for (int j = 0; j < copy_size; ++j) { + for (int64_t j = 0; j < copy_size; ++j) { int64_t in_idx = input_offset + j; uint8_t val = input_ptr[in_idx / 2]; uint8_t nibble = (in_idx % 2 == 0) ? (val & 0x0F) : ((val >> 4) & 0x0F); @@ -184,9 +189,9 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params, const float inverse_output_scale = 1.f / output_scale; uint8_t* output_ptr = output_data; - for (int k = 0; k < outer_size; k++) { + for (int64_t k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { - const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; + const int64_t copy_size = input_shapes[i]->Dims(axis) * base_inner_size; const uint8_t* input_ptr = input_data[i] + k * copy_size; if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale) { @@ -194,7 +199,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params, } else { const float scale = input_scale[i] * inverse_output_scale; const float bias = -input_zeropoint[i] * scale; - for (int j = 0; j < copy_size; ++j) { + for (int64_t j = 0; j < copy_size; ++j) { const int32_t value = static_cast(tflite::TfLiteRound( input_ptr[j] * scale + bias)) + output_zeropoint; diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h index d4fba1399fb..be9f5fcbe0c 100644 --- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h +++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h @@ -153,7 +153,8 @@ struct DepthwiseConvBasicKernel { for (int out_x = 0; out_x < output_width; ++out_x) { for (int ic = 0; ic < input_depth; ++ic) { for (int m = 0; m < depth_multiplier; m++) { - const int oc = m + ic * depth_multiplier; + const int64_t oc = + m + static_cast(ic) * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; @@ -240,7 +241,8 @@ struct DepthwiseConvBasicKernel { for (int out_x = 0; out_x < output_width; ++out_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { for (int m = 0; m < depth_multiplier; ++m) { - const int output_channel = m + in_channel * depth_multiplier; + const int64_t output_channel = + m + static_cast(in_channel) * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; diff --git a/tensorflow/lite/kernels/internal/reference/div.h b/tensorflow/lite/kernels/internal/reference/div.h index 5f26d3b8e6d..e9f09c802c8 100644 --- a/tensorflow/lite/kernels/internal/reference/div.h +++ b/tensorflow/lite/kernels/internal/reference/div.h @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { @@ -117,29 +118,13 @@ inline void BroadcastDivSlowQuantized( const T* input1_data, const RuntimeShape& unextended_input2_shape, const T* input2_data, const RuntimeShape& unextended_output_shape, T* output_data) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - DivCheckArithmeticParams(params); - auto div_func = [&](int indexes[N]) { - int32_t input1_val = - params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)]; - int32_t input2_val = - params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)]; + auto op = [¶ms](T a, T b) { + int32_t input1_val = params.input1_offset + a; + int32_t input2_val = params.input2_offset + b; TFLITE_DCHECK_NE(input2_val, 0); if (input2_val < 0) { - // Invert signs to avoid a negative input2_val as input2_inv needs to be - // positive to be used as multiplier of MultiplyByQuantizedMultiplier. input1_val = -input1_val; input2_val = -input2_val; } @@ -157,10 +142,12 @@ inline void BroadcastDivSlowQuantized( const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); - output_data[SubscriptToIndex(output_desc, indexes)] = - static_cast(clamped_output); + return static_cast(clamped_output); }; - NDOpsHelper(output_desc, div_func); + + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } template @@ -202,10 +189,6 @@ inline void BroadcastDivSlow(const ArithmeticParams& params, input2_data, unextended_output_shape, output_data); } -// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. template void BroadcastDivSlow(const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape, @@ -218,34 +201,14 @@ void BroadcastDivSlow(const ArithmeticParams& params, T output_activation_max; GetActivationParams(params, &output_activation_min, &output_activation_max); - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - - auto div_func = [&](int indexes[N]) { - output_data[SubscriptToIndex(output_desc, indexes)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, indexes)] / - input2_data[SubscriptToIndex(desc2, indexes)], - output_activation_min, output_activation_max); + auto op = [output_activation_min, output_activation_max](T a, T b) { + return ActivationFunctionWithMinMax(a / b, output_activation_min, + output_activation_max); }; - NDOpsHelper(output_desc, div_func); + + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index c2a0e0f082c..692ac926ed9 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -71,58 +72,6 @@ void ElementWise(int size, const ArithmeticParams& params, const T* input1_data, } } -template -inline void BroadcastAddRecursiveDimensions( - const ArithmeticParams& params, int dimension, size_t* input1_offset_p, - size_t* input2_offset_p, size_t* output_offset, - size_t* compressed_input1_stride, size_t* compressed_input2_stride, - size_t* compressed_output_shape, const T* input1_data, const T* input2_data, - T* output_data, void (*check_arithmetic_params)(const ArithmeticParams&), - T (*binary_func)(T, T, const ArithmeticParams&)) { - if (dimension > 0) { - for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) { - size_t input1_offset_c = *input1_offset_p; - size_t input2_offset_c = *input2_offset_p; - BroadcastAddRecursiveDimensions( - params, dimension - 1, &input1_offset_c, &input2_offset_c, - output_offset, compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, input1_data, input2_data, output_data, - check_arithmetic_params, binary_func); - *input1_offset_p += compressed_input1_stride[dimension]; - *input2_offset_p += compressed_input2_stride[dimension]; - } - } else { - TFLITE_DCHECK(dimension == 0); - bool input1_is_broadcast = compressed_input1_stride[dimension] == 0; - bool input2_is_broadcast = compressed_input2_stride[dimension] == 0; - TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast)); - const T* input1_data_ptr = input1_data + *input1_offset_p; - const T* input2_data_ptr = input2_data + *input2_offset_p; - T* output_data_ptr = output_data + *output_offset; - if (input1_is_broadcast) { - // input1 is broadcast. - BroadcastInput1(compressed_output_shape[dimension], params, - input1_data_ptr, input2_data_ptr, output_data_ptr, - check_arithmetic_params, binary_func); - *input2_offset_p += compressed_output_shape[dimension]; - } else if (input2_is_broadcast) { - // input2 is broadcast. - BroadcastInput2(compressed_output_shape[dimension], params, - input1_data_ptr, input2_data_ptr, output_data_ptr, - check_arithmetic_params, binary_func); - *input1_offset_p += compressed_output_shape[dimension]; - } else { - // Add element-wise. - ElementWise(compressed_output_shape[dimension], params, - input1_data_ptr, input2_data_ptr, output_data_ptr, - check_arithmetic_params, binary_func); - *input1_offset_p += compressed_output_shape[dimension]; - *input2_offset_p += compressed_output_shape[dimension]; - } - *output_offset += compressed_output_shape[dimension]; - } -} - // TODO: b/270589088 - move to a more appropriate file. (b/270589088#comment2) template void BroadcastBinaryFunction6DSlow( @@ -131,38 +80,13 @@ void BroadcastBinaryFunction6DSlow( const T* input2_data, const RuntimeShape& output_shape, T* output_data, void (*check_arithmetic_params)(const ArithmeticParams&), T (*binary_func)(T, T, const ArithmeticParams&)) { - constexpr int kMaxBroadcastDim = 6; - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - size_t compressed_input1_stride[kMaxBroadcastDim]; - size_t compressed_input2_stride[kMaxBroadcastDim]; - size_t compressed_output_shape[kMaxBroadcastDim]; - bool broadcastable_shape = ReduceDimensionsForBroadcast( - input1_shape, input2_shape, compressed_input1_stride, - compressed_input2_stride, compressed_output_shape); - // Skip broadcasting for degenerate shapes. - if (!broadcastable_shape) { - return; - } - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastAddRecursiveDimensions( - params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset, - &output_offset, compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, input1_data, input2_data, output_data, - check_arithmetic_params, binary_func); + check_arithmetic_params(params); + auto op = [¶ms, binary_func](T a, T b) { + return binary_func(a, b, params); + }; + reference_ops::BroadcastBinaryOpSimple(input1_shape, input1_data, + input2_shape, input2_data, + output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h index a57056d5b17..ab3f1843e24 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h @@ -20,6 +20,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { namespace reference_integer_ops { @@ -96,88 +97,22 @@ inline void BroadcastMul6DSlow( const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { ruy::profiler::ScopeLabel label("BroadcastMul6DSlow"); - - NdArrayDesc desc1; - NdArrayDesc desc2; - // The input shapes are extended as part of NdArrayDesc initialization. - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, - &desc2); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, output_shape); - // Cache output shape dimensions. - int32_t extended_output_shape_dims[kMaxMulBroadcastDim]; - std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(), - sizeof(extended_output_shape_dims)); - - size_t input1_offset_a = 0; - size_t input2_offset_a = 0; - size_t output_offset_a = 0; - for (int a = 0; a < extended_output_shape_dims[0]; ++a) { - size_t input1_offset_d = input1_offset_a; - size_t input2_offset_d = input2_offset_a; - size_t output_offset_d = output_offset_a; - for (int d = 0; d < extended_output_shape_dims[1]; ++d) { - size_t input1_offset_b = input1_offset_d; - size_t input2_offset_b = input2_offset_d; - size_t output_offset_b = output_offset_d; - for (int b = 0; b < extended_output_shape_dims[2]; ++b) { - size_t input1_offset_y = input1_offset_b; - size_t input2_offset_y = input2_offset_b; - size_t output_offset_y = output_offset_b; - for (int y = 0; y < extended_output_shape_dims[3]; ++y) { - size_t input1_offset_x = input1_offset_y; - size_t input2_offset_x = input2_offset_y; - size_t output_offset_x = output_offset_y; - for (int x = 0; x < extended_output_shape_dims[4]; ++x) { - size_t input1_offset_c = input1_offset_x; - size_t input2_offset_c = input2_offset_x; - size_t output_offset_c = output_offset_x; - for (int c = 0; c < extended_output_shape_dims[5]; ++c) { - const int32_t input1_val = - params.input1_offset + input1_data[input1_offset_c]; - const int32_t input2_val = - params.input2_offset + input2_data[input2_offset_c]; - const int32_t unclamped_result = - params.output_offset + - MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = std::min( - params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); - output_data[output_offset_c] = static_cast(clamped_output); - input1_offset_c += desc1.strides[5]; - input2_offset_c += desc2.strides[5]; - ++output_offset_c; - } - input1_offset_x += desc1.strides[4]; - input2_offset_x += desc2.strides[4]; - output_offset_x += extended_output_shape_dims[5]; - } - input1_offset_y += desc1.strides[3]; - input2_offset_y += desc2.strides[3]; - output_offset_y += - extended_output_shape_dims[4] * extended_output_shape_dims[5]; - } - input1_offset_b += desc1.strides[2]; - input2_offset_b += desc2.strides[2]; - output_offset_b += extended_output_shape_dims[3] * - extended_output_shape_dims[4] * - extended_output_shape_dims[5]; - } - input1_offset_d += desc1.strides[1]; - input2_offset_d += desc2.strides[1]; - output_offset_d += - extended_output_shape_dims[2] * extended_output_shape_dims[3] * - extended_output_shape_dims[4] * extended_output_shape_dims[5]; - } - input1_offset_a += desc1.strides[0]; - input2_offset_a += desc2.strides[0]; - output_offset_a += - extended_output_shape_dims[1] * extended_output_shape_dims[2] * - extended_output_shape_dims[3] * extended_output_shape_dims[4] * - extended_output_shape_dims[5]; - } + auto op = [¶ms](T a, T b) { + const int32_t input1_val = params.input1_offset + a; + const int32_t input2_val = params.input2_offset + b; + const int32_t unclamped_result = + params.output_offset + + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + return static_cast(clamped_output); + }; + reference_ops::BroadcastBinaryOpSimple(input1_shape, input1_data, + input2_shape, input2_data, + output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h index cd11b4191ac..3efcc3caf8b 100644 --- a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h +++ b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_ #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -37,24 +38,9 @@ void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape, output_data[i] = op(input1_data[i], input2_data[i]); } } else { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDesc output_desc; - NdArrayDescsForElementwiseBroadcast( - unextended_input1_shape, unextended_input2_shape, &desc1, &desc2); - CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), - &output_desc); - - auto maxmin_func = [&](int indexes[N]) { - output_data[SubscriptToIndex(output_desc, indexes)] = - op(input1_data[SubscriptToIndex(desc1, indexes)], - input2_data[SubscriptToIndex(desc2, indexes)]); - }; - NDOpsHelper(output_desc, maxmin_func); + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } } diff --git a/tensorflow/lite/kernels/internal/reference/mul.h b/tensorflow/lite/kernels/internal/reference/mul.h index fca74a32c12..ea66dae226f 100644 --- a/tensorflow/lite/kernels/internal/reference/mul.h +++ b/tensorflow/lite/kernels/internal/reference/mul.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" namespace tflite { @@ -91,38 +92,6 @@ inline void Mul(const ArithmeticParams& params, MulElementwise(flat_size, params, input1_data, input2_data, output_data); } -template -void BroadcastMulRecursiveDimensions( - const ArithmeticParams& params, int dimension, const T* input1_data, - const T* input2_data, T* output_data, size_t* input1_offset_p, - size_t* input2_offset_p, size_t* output_offset, - const NdArrayDesc& desc1, - const NdArrayDesc& desc2, - const int32_t extended_output_shape_dims[kMaxMulBroadcastDim], - F binary_func) { - if (dimension == kMaxMulBroadcastDim - 1) { - for (int c = 0; c < extended_output_shape_dims[dimension]; ++c) { - const T input1_val = input1_data[*input1_offset_p]; - const T input2_val = input2_data[*input2_offset_p]; - output_data[*output_offset] = binary_func(params, input1_val, input2_val); - *input1_offset_p += desc1.strides[dimension]; - *input2_offset_p += desc2.strides[dimension]; - ++(*output_offset); - } - } else { - for (int a = 0; a < extended_output_shape_dims[dimension]; ++a) { - size_t input1_offset_c = *input1_offset_p; - size_t input2_offset_c = *input2_offset_p; - BroadcastMulRecursiveDimensions( - params, dimension + 1, input1_data, input2_data, output_data, - &input1_offset_c, &input2_offset_c, output_offset, desc1, desc2, - extended_output_shape_dims, binary_func); - *input1_offset_p += desc1.strides[dimension]; - *input2_offset_p += desc2.strides[dimension]; - } - } -} - inline void BroadcastMul6DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const uint8_t* input1_data, @@ -130,37 +99,21 @@ inline void BroadcastMul6DSlow(const ArithmeticParams& params, const uint8_t* input2_data, const RuntimeShape& output_shape, uint8_t* output_data) { - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, - &desc2); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, output_shape); - // Cache output shape dimensions. - int32_t extended_output_shape_dims[kMaxMulBroadcastDim]; - std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(), - sizeof(extended_output_shape_dims)); - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastMulRecursiveDimensions( - params, 0, input1_data, input2_data, output_data, &input1_offset, - &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims, - [](const ArithmeticParams& params, const uint8_t input1_val, - const uint8_t input2_val) { - const int32_t offsetted_input1_val = params.input1_offset + input1_val; - const int32_t offsetted_input2_val = params.input2_offset + input2_val; - const int32_t unclamped_result = - params.output_offset + - MultiplyByQuantizedMultiplier( - offsetted_input1_val * offsetted_input2_val, - params.output_multiplier, params.output_shift); - const int32_t clamped_output = std::min( - params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); - return static_cast(clamped_output); - }); + auto op = [¶ms](uint8_t input1_val, uint8_t input2_val) { + const int32_t offsetted_input1_val = params.input1_offset + input1_val; + const int32_t offsetted_input2_val = params.input2_offset + input2_val; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier( + offsetted_input1_val * offsetted_input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + return static_cast(clamped_output); + }; + BroadcastBinaryOpSimple(input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, op); } template desc1; - NdArrayDesc desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape); - // Cache output shape dimensions. - int32_t extended_output_shape_dims[kMaxMulBroadcastDim]; - std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(), - sizeof(extended_output_shape_dims)); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastMulRecursiveDimensions( - params, 0, input1_data, input2_data, output_data, &input1_offset, - &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims, - [](const ArithmeticParams& params, const T input1_val, - const T input2_val) { - T output_activation_min; - T output_activation_max; - GetActivationParams(params, &output_activation_min, - &output_activation_max); - return ActivationFunctionWithMinMax(input1_val * input2_val, - output_activation_min, - output_activation_max); - }); + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + auto op = [output_activation_min, output_activation_max](T a, T b) { + return ActivationFunctionWithMinMax(a * b, output_activation_min, + output_activation_max); + }; + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } inline void BroadcastMul6DSlow(const ArithmeticParams& params, @@ -225,31 +147,10 @@ inline void BroadcastMul6DSlow(const ArithmeticParams& params, const std::complex* input2_data, const RuntimeShape& unextended_output_shape, std::complex* output_data) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 6); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 6); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 6); - - NdArrayDesc desc1; - NdArrayDesc desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape); - // Cache output shape dimensions. - int32_t extended_output_shape_dims[kMaxMulBroadcastDim]; - std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(), - sizeof(extended_output_shape_dims)); - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastMulRecursiveDimensions( - params, 0, input1_data, input2_data, output_data, &input1_offset, - &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims, - [](const ArithmeticParams& params, const std::complex input1_val, - const std::complex input2_val) { - return input1_val * input2_val; - }); + auto op = [](std::complex a, std::complex b) { return a * b; }; + BroadcastBinaryOpSimple(unextended_input1_shape, input1_data, + unextended_input2_shape, input2_data, + unextended_output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index 1a5ef0cb1f4..244b6b3c461 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -31,47 +32,30 @@ inline void BroadcastPrelu4DSlow( const PreluParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& alpha_shape, const U* alpha_data, const RuntimeShape& output_shape, T* output_data) { - TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2); - - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - int output_index = Offset(extended_output_shape, b, y, x, c); - int input_index = SubscriptToIndex(desc1, b, y, x, c); - const int32_t input_value = - params.input_offset + input_data[input_index]; - int32_t output_value; - if (input_value >= 0) { - output_value = MultiplyByQuantizedMultiplier( - input_value, params.output_multiplier_1, params.output_shift_1); - } else { - auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); - const int32_t alpha_value = - params.alpha_offset + alpha_data[alpha_index]; - - output_value = MultiplyByQuantizedMultiplier( - input_value * alpha_value, params.output_multiplier_2, - params.output_shift_2); - } - output_value += params.output_offset; - - const int32_t quantized_min = std::numeric_limits::min(); - const int32_t quantized_max = std::numeric_limits::max(); - const int32_t clamped_output = - std::min(quantized_max, std::max(quantized_min, output_value)); - output_data[output_index] = static_cast(clamped_output); - } - } + const int32_t quantized_min = std::numeric_limits::min(); + const int32_t quantized_max = std::numeric_limits::max(); + + auto op = [¶ms, quantized_min, quantized_max](T input_val, U alpha_val) { + const int32_t input_value = params.input_offset + input_val; + int32_t output_value; + if (input_value >= 0) { + output_value = MultiplyByQuantizedMultiplier( + input_value, params.output_multiplier_1, params.output_shift_1); + } else { + const int32_t alpha_value = params.alpha_offset + alpha_val; + output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value, + params.output_multiplier_2, + params.output_shift_2); } - } + output_value += params.output_offset; + + const int32_t clamped_output = + std::min(quantized_max, std::max(quantized_min, output_value)); + return static_cast(clamped_output); + }; + + BroadcastBinaryOpSimple(input_shape, input_data, alpha_shape, alpha_data, + output_shape, output_data, op); } template diff --git a/tensorflow/lite/kernels/internal/reference/select.h b/tensorflow/lite/kernels/internal/reference/select.h index 82b6097c218..4939d067433 100644 --- a/tensorflow/lite/kernels/internal/reference/select.h +++ b/tensorflow/lite/kernels/internal/reference/select.h @@ -15,10 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SELECT_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SELECT_H_ +#include #include +#include #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -74,6 +77,157 @@ void RankOneSelect(const RuntimeShape& input_condition_shape, } } +template +void RunSelectOp(const D* cond, const T* x, const T* y, T* output, + const size_t* cond_stride, const size_t* x_stride, + const size_t* y_stride, const size_t* output_stride, + const size_t* output_shape, int dim) { + TFLITE_DCHECK_GE(dim, 0); + size_t output_shape_0 = output_shape[dim]; + size_t output_stride_0 = output_stride[dim]; + size_t cond_stride_0 = cond_stride[dim]; + size_t x_stride_0 = x_stride[dim]; + size_t y_stride_0 = y_stride[dim]; + if (dim == 0) { + TFLITE_DCHECK_EQ(output_stride_0, 1); + if (cond_stride_0 == 0) { + if (*cond) { + if (x_stride_0 == 0) { + std::fill_n(output, output_shape_0, *x); + } else { + TFLITE_DCHECK_EQ(x_stride_0, 1); + std::memcpy(output, x, output_shape_0 * sizeof(T)); + } + } else { + if (y_stride_0 == 0) { + std::fill_n(output, output_shape_0, *y); + } else { + TFLITE_DCHECK_EQ(y_stride_0, 1); + std::memcpy(output, y, output_shape_0 * sizeof(T)); + } + } + } else { + TFLITE_DCHECK_EQ(cond_stride_0, 1); + if (x_stride_0 == 0 && y_stride_0 == 0) { + for (size_t i = 0; i < output_shape_0; ++i) { + output[i] = cond[i] ? *x : *y; + } + } else if (x_stride_0 == 0) { + TFLITE_DCHECK_EQ(y_stride_0, 1); + for (size_t i = 0; i < output_shape_0; ++i) { + output[i] = cond[i] ? *x : y[i]; + } + } else if (y_stride_0 == 0) { + TFLITE_DCHECK_EQ(x_stride_0, 1); + for (size_t i = 0; i < output_shape_0; ++i) { + output[i] = cond[i] ? x[i] : *y; + } + } else { + TFLITE_DCHECK_EQ(x_stride_0, 1); + TFLITE_DCHECK_EQ(y_stride_0, 1); + for (size_t i = 0; i < output_shape_0; ++i) { + output[i] = cond[i] ? x[i] : y[i]; + } + } + } + } else { + dim -= 1; + for (size_t i = 0; i < output_shape_0; ++i) { + RunSelectOp(cond, x, y, output, cond_stride, x_stride, y_stride, + output_stride, output_shape, dim); + cond += cond_stride_0; + x += x_stride_0; + y += y_stride_0; + output += output_stride_0; + } + } +} + +template +inline void BroadcastSelectSimple(const RuntimeShape& cond_shape, + const D* cond_data, + const RuntimeShape& x_shape, const T* x_data, + const RuntimeShape& y_shape, const T* y_data, + const RuntimeShape& output_shape, + T* output_data) { + constexpr int kMaxRank = 8; + const int dims_count = std::max( + output_shape.DimensionsCount(), + std::max(cond_shape.DimensionsCount(), + std::max(x_shape.DimensionsCount(), y_shape.DimensionsCount()))); + if (dims_count <= 0) { + *output_data = *cond_data ? *x_data : *y_data; + return; + } + + TFLITE_DCHECK_LE(dims_count, kMaxRank); + + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(dims_count, output_shape); + const RuntimeShape extended_cond_shape = + RuntimeShape::ExtendedShape(dims_count, cond_shape); + const RuntimeShape extended_x_shape = + RuntimeShape::ExtendedShape(dims_count, x_shape); + const RuntimeShape extended_y_shape = + RuntimeShape::ExtendedShape(dims_count, y_shape); + + size_t cond_strides[kMaxRank]; + size_t x_strides[kMaxRank]; + size_t y_strides[kMaxRank]; + size_t o_strides[kMaxRank]; + size_t o_shape[kMaxRank]; + + size_t cond_accum_stride = 1; + size_t x_accum_stride = 1; + size_t y_accum_stride = 1; + size_t o_accum_stride = 1; + int next_dim_idx = -1; + for (int i = dims_count - 1; i >= 0; --i) { + const int cond_dim = extended_cond_shape.Dims(i); + const int x_dim = extended_x_shape.Dims(i); + const int y_dim = extended_y_shape.Dims(i); + const int output_dim = extended_output_shape.Dims(i); + if (cond_dim <= 0 || x_dim <= 0 || y_dim <= 0 || output_dim <= 0) { + // Empty operation. + return; + } + size_t cond_stride = + (cond_dim == 1 && output_dim != 1) ? 0 : cond_accum_stride; + size_t x_stride = (x_dim == 1 && output_dim != 1) ? 0 : x_accum_stride; + size_t y_stride = (y_dim == 1 && output_dim != 1) ? 0 : y_accum_stride; + size_t o_stride = o_accum_stride; + + if (next_dim_idx >= 0 && + CanFuseLoops(output_dim, cond_dim, cond_stride, cond_accum_stride, + cond_strides[next_dim_idx]) && + CanFuseLoops(output_dim, x_dim, x_stride, x_accum_stride, + x_strides[next_dim_idx]) && + CanFuseLoops(output_dim, y_dim, y_stride, y_accum_stride, + y_strides[next_dim_idx]) && + CanFuseLoops(output_dim, output_dim, o_stride, o_accum_stride, + o_strides[next_dim_idx])) { + // This dimension can be fused into one loop with the previous + // dimension. + o_shape[next_dim_idx] *= output_dim; + } else { + ++next_dim_idx; + cond_strides[next_dim_idx] = cond_stride; + x_strides[next_dim_idx] = x_stride; + y_strides[next_dim_idx] = y_stride; + o_strides[next_dim_idx] = o_stride; + o_shape[next_dim_idx] = output_dim; + } + + cond_accum_stride *= cond_dim; + x_accum_stride *= x_dim; + y_accum_stride *= y_dim; + o_accum_stride *= output_dim; + } + + RunSelectOp(cond_data, x_data, y_data, output_data, cond_strides, x_strides, + y_strides, o_strides, o_shape, next_dim_idx); +} + template void BroadcastSelect5DSlow(const RuntimeShape& input_condition_shape, const D* input_condition_data, @@ -88,61 +242,9 @@ void BroadcastSelect5DSlow(const RuntimeShape& input_condition_shape, TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 5); TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 5); - NdArrayDesc<5> desc_condition; - NdArrayDesc<5> desc_x; - NdArrayDesc<5> desc_y; - NdArrayDesc<5> desc_output; - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(5, output_shape); - CopyDimsToDesc(extended_output_shape, &desc_output); - NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, - input_y_shape, &desc_condition, &desc_x, - &desc_y); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. - for (int n = 0; n < desc_output.extents[0]; ++n) { - int out_idx_n = desc_output.extents[1] * n; - int cond_idx_n = desc_condition.strides[0] * n; - int in_idx1_n = desc_x.strides[0] * n; - int in_idx2_n = desc_y.strides[0] * n; - for (int b = 0; b < desc_output.extents[1]; ++b) { - int out_idx_b = (out_idx_n + b) * desc_output.extents[2]; - int cond_idx_b = cond_idx_n + desc_condition.strides[1] * b; - int in_idx1_b = in_idx1_n + desc_x.strides[1] * b; - int in_idx2_b = in_idx2_n + desc_y.strides[1] * b; - for (int y = 0; y < desc_output.extents[2]; ++y) { - int out_idx_y = (out_idx_b + y) * desc_output.extents[3]; - int cond_idx_y = cond_idx_b + desc_condition.strides[2] * y; - int in_idx1_y = in_idx1_b + desc_x.strides[2] * y; - int in_idx2_y = in_idx2_b + desc_y.strides[2] * y; - for (int x = 0; x < desc_output.extents[3]; ++x) { - int out_idx = (out_idx_y + x) * desc_output.extents[4]; - int cond_idx = cond_idx_y + desc_condition.strides[3] * x; - int in_idx1 = in_idx1_y + desc_x.strides[3] * x; - int in_idx2 = in_idx2_y + desc_y.strides[3] * x; - for (int c = 0; c < desc_output.extents[4]; ++c) { - output_data[out_idx] = input_condition_data[cond_idx] - ? input_x_data[in_idx1] - : input_y_data[in_idx2]; - out_idx++; - cond_idx += desc_condition.strides[4]; - in_idx1 += desc_x.strides[4]; - in_idx2 += desc_y.strides[4]; - } - } - } - } - } + BroadcastSelectSimple(input_condition_shape, input_condition_data, + input_x_shape, input_x_data, input_y_shape, + input_y_data, output_shape, output_data); } } // namespace reference_ops diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h index 2930217b61f..27018436503 100644 --- a/tensorflow/lite/kernels/internal/reference/softmax.h +++ b/tensorflow/lite/kernels/internal/reference/softmax.h @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -28,9 +29,11 @@ limitations under the License. namespace tflite { namespace reference_ops { +template ::value, int>::type = 0> inline void Softmax(const SoftmaxParams& params, - const RuntimeShape& input_shape, const float* input_data, - const RuntimeShape& output_shape, float* output_data) { + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -38,26 +41,24 @@ inline void Softmax(const SoftmaxParams& params, MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { - // Find max element value which we'll use to ensure numerical stability - // taking advantage of the following equality: - // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) - float max = std::numeric_limits::lowest(); + T max = std::numeric_limits::lowest(); for (int c = 0; c < depth; ++c) { max = std::max(max, input_data[i * depth + c]); } - // Compute sum. float sum = 0.f; for (int c = 0; c < depth; ++c) { - const float exp_c = std::exp((input_data[i * depth + c] - max) * - static_cast(params.beta)); - output_data[i * depth + c] = exp_c; + const float exp_c = + std::exp((static_cast(input_data[i * depth + c]) - + static_cast(max)) * + static_cast(params.beta)); + output_data[i * depth + c] = static_cast(exp_c); sum += exp_c; } - // Compute result. for (int c = 0; c < depth; ++c) { - output_data[i * depth + c] = output_data[i * depth + c] / sum; + output_data[i * depth + c] = + static_cast(static_cast(output_data[i * depth + c]) / sum); } } } diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h index 1a74aebeafc..c9009182927 100644 --- a/tensorflow/lite/kernels/internal/reference/sub.h +++ b/tensorflow/lite/kernels/internal/reference/sub.h @@ -18,179 +18,17 @@ limitations under the License. #include #include -#include -#include #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/reference/broadcast_loop.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { namespace reference_ops { -template -struct SubImpl { - template - static void BroadcastInput1(const ArithmeticParams& params, - const T* input1_data, const T* input2_data, - T* output_data, size_t size, F binary_func) { - for (size_t c = 0; c < size; ++c) { - output_data[c] = binary_func(input1_data[0], input2_data[c], params); - } - } - - template - static void BroadcastInput2(const ArithmeticParams& params, - const T* input1_data, const T* input2_data, - T* output_data, size_t size, F binary_func) { - for (size_t c = 0; c < size; ++c) { - output_data[c] = binary_func(input1_data[c], input2_data[0], params); - } - } - - template - static void ElementWise(const ArithmeticParams& params, const T* input1_data, - const T* input2_data, T* output_data, size_t size, - F binary_func) { - for (size_t c = 0; c < size; ++c) { - output_data[c] = binary_func(input1_data[c], input2_data[c], params); - } - } -}; - -template <> -struct SubImpl { - template - static void BroadcastInput1(const ArithmeticParams& params, - const int32_t* input1_data, - const int32_t* input2_data, int32_t* output_data, - size_t size, F binary_func) { - size_t c = 0; - int32_t activation_min, activation_max; - GetActivationParams(params, &activation_min, &activation_max); -#ifdef USE_NEON - const int32x4_t vmax = vdupq_n_s32(activation_max); - const int32x4_t vmin = vdupq_n_s32(activation_min); - const int32x4_t va = vdupq_n_s32(input1_data[0]); - for (; c + 4 <= size; c += 4) { - const int32x4_t vb = vld1q_s32(&input2_data[c]); - int32x4_t vres = vsubq_s32(va, vb); - vres = vmaxq_s32(vmin, vres); - vres = vminq_s32(vmax, vres); - vst1q_s32(&output_data[c], vres); - } -#endif - for (; c < size; ++c) { - output_data[c] = binary_func(input1_data[0], input2_data[c], params); - } - } - - template - static void BroadcastInput2(const ArithmeticParams& params, - const int32_t* input1_data, - const int32_t* input2_data, int32_t* output_data, - size_t size, F binary_func) { - size_t c = 0; - int32_t activation_min, activation_max; - GetActivationParams(params, &activation_min, &activation_max); -#ifdef USE_NEON - const int32x4_t vmax = vdupq_n_s32(activation_max); - const int32x4_t vmin = vdupq_n_s32(activation_min); - const int32x4_t vb = vdupq_n_s32(input2_data[0]); - for (; c + 4 <= size; c += 4) { - const int32x4_t va = vld1q_s32(&input1_data[c]); - int32x4_t vres = vsubq_s32(va, vb); - vres = vmaxq_s32(vmin, vres); - vres = vminq_s32(vmax, vres); - vst1q_s32(&output_data[c], vres); - } -#endif - for (; c < size; ++c) { - output_data[c] = binary_func(input1_data[c], input2_data[0], params); - } - } - - template - static void ElementWise(const ArithmeticParams& params, - const int32_t* input1_data, - const int32_t* input2_data, int32_t* output_data, - size_t size, F binary_func) { - size_t c = 0; - int32_t activation_min, activation_max; - GetActivationParams(params, &activation_min, &activation_max); -#ifdef USE_NEON - int32x4_t vmax = vdupq_n_s32(activation_max); - int32x4_t vmin = vdupq_n_s32(activation_min); - for (; c + 4 <= size; c += 4) { - const int32x4_t va = vld1q_s32(&input1_data[c]); - const int32x4_t vb = vld1q_s32(&input2_data[c]); - int32x4_t vres = vsubq_s32(va, vb); - vres = vmaxq_s32(vmin, vres); - vres = vminq_s32(vmax, vres); - vst1q_s32(&output_data[c], vres); - } -#endif - for (; c < size; ++c) { - output_data[c] = binary_func(input1_data[c], input2_data[c], params); - } - } -}; - -template -inline void BroadcastSubRecursiveDimensions( - int dimension, const ArithmeticParams& params, const T* input1_data, - const T* input2_data, T* output_data, size_t* input1_offset_p, - size_t* input2_offset_p, size_t* output_offset, - size_t* compressed_input1_stride, size_t* compressed_input2_stride, - size_t* compressed_output_shape, F binary_func) { - if (dimension > 0) { - for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) { - size_t input1_offset_c = *input1_offset_p; - size_t input2_offset_c = *input2_offset_p; - BroadcastSubRecursiveDimensions( - dimension - 1, params, input1_data, input2_data, output_data, - &input1_offset_c, &input2_offset_c, output_offset, - compressed_input1_stride, compressed_input2_stride, - compressed_output_shape, binary_func); - *input1_offset_p += compressed_input1_stride[dimension]; - *input2_offset_p += compressed_input2_stride[dimension]; - } - } else { - TFLITE_DCHECK(dimension == 0); - bool input1_is_broadcast = compressed_input1_stride[dimension] == 0; - bool input2_is_broadcast = compressed_input2_stride[dimension] == 0; - TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast)); - const T* input1_data_ptr = input1_data + *input1_offset_p; - const T* input2_data_ptr = input2_data + *input2_offset_p; - T* output_data_ptr = output_data + *output_offset; - if (input1_is_broadcast) { - // input1 is broadcast. - SubImpl::BroadcastInput1( - params, input1_data_ptr, input2_data_ptr, output_data_ptr, - compressed_output_shape[dimension], binary_func); - *input2_offset_p += compressed_output_shape[dimension]; - } else if (input2_is_broadcast) { - // input2 is broadcast. - SubImpl::BroadcastInput2( - params, input1_data_ptr, input2_data_ptr, output_data_ptr, - compressed_output_shape[dimension], binary_func); - *input1_offset_p += compressed_output_shape[dimension]; - } else { - // Add element-wise. - SubImpl::ElementWise(params, input1_data_ptr, input2_data_ptr, - output_data_ptr, - compressed_output_shape[dimension], binary_func); - *input1_offset_p += compressed_output_shape[dimension]; - *input2_offset_p += compressed_output_shape[dimension]; - } - *output_offset += compressed_output_shape[dimension]; - } -} - -// TODO: b/296510380 - we may be able to factor out this to common.h for all -// binary arithmetic ops (add, sub, mul). template inline void BroadcastSubCommon(const ArithmeticParams& params, const RuntimeShape& input1_shape, @@ -199,59 +37,13 @@ inline void BroadcastSubCommon(const ArithmeticParams& params, const T* input2_data, const RuntimeShape& output_shape, T* output_data, F binary_func) { - constexpr int kMaxBroadcastDim = 6; - TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxBroadcastDim); - TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxBroadcastDim); - TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxBroadcastDim); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - - size_t compressed_input1_stride[kMaxBroadcastDim]; - size_t compressed_input2_stride[kMaxBroadcastDim]; - size_t compressed_output_shape[kMaxBroadcastDim]; - bool broadcastable_shape = ReduceDimensionsForBroadcast( - input1_shape, input2_shape, compressed_input1_stride, - compressed_input2_stride, compressed_output_shape); - // Skip broadcasting for degenerate shapes. - if (!broadcastable_shape) { - return; - } - - size_t input1_offset = 0; - size_t input2_offset = 0; - size_t output_offset = 0; - BroadcastSubRecursiveDimensions( - kMaxBroadcastDim - 1, params, input1_data, input2_data, output_data, - &input1_offset, &input2_offset, &output_offset, compressed_input1_stride, - compressed_input2_stride, compressed_output_shape, binary_func); + auto op = [¶ms, binary_func](T a, T b) { + return binary_func(a, b, params); + }; + BroadcastBinaryOpSimple(input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, op); } -// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. template void BroadcastSubSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, diff --git a/tensorflow/lite/kernels/internal/reference/transpose.h b/tensorflow/lite/kernels/internal/reference/transpose.h index 7e2bf7b266a..fbd991141a7 100644 --- a/tensorflow/lite/kernels/internal/reference/transpose.h +++ b/tensorflow/lite/kernels/internal/reference/transpose.h @@ -176,6 +176,9 @@ template void Transpose(const TransposeParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& output_shape, T* output_data) { + if (input_shape.FlatSize() == 0) { + return; + } using transpose_internal::SetupTransposeStrides; using transpose_internal::TransposeImpl; using transpose_internal::TransposeStorageType; diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h index 1cfc43d1662..3b26fee0327 100644 --- a/tensorflow/lite/kernels/internal/types.h +++ b/tensorflow/lite/kernels/internal/types.h @@ -19,10 +19,40 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/runtime_shape.h" +#include "tensorflow/lite/types/half.h" + +namespace std { +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = + true; // NOLINT(readability-identifier-naming) + static constexpr tflite::half min() noexcept { + return tflite::half::smallest_normal(); + } + static constexpr tflite::half max() noexcept { return tflite::half::max(); } + static constexpr tflite::half lowest() noexcept { + return tflite::half::min(); + } + static constexpr tflite::half epsilon() noexcept { + return tflite::half::epsilon(); + } + static constexpr tflite::half quiet_NaN() noexcept { +#if TFLITE_ARCH_FLOAT16 + return tflite::half(__builtin_nanf("")); +#else + return tflite::half::from_bits(0x7e00); +#endif + } + static constexpr bool is_signed = + true; // NOLINT(readability-identifier-naming) +}; +} // namespace std namespace tflite { @@ -170,11 +200,13 @@ inline bool NextIndex(const int num_dims, const int* dims, IndexType* current) { } TFLITE_DCHECK(dims != nullptr); TFLITE_DCHECK(current != nullptr); + for (int i = 0; i < num_dims; ++i) { + TFLITE_DCHECK_GE(dims[i], 0); + } int carry = 1; for (int idx = num_dims - 1; idx >= 0; --idx) { IndexType current_val = current[idx] + carry; - TFLITE_DCHECK_GE(dims[idx], current_val); - if (dims[idx] == current_val) { + if (current_val >= dims[idx]) { current[idx] = 0; } else { current[idx] = current_val; @@ -683,10 +715,10 @@ struct ArithmeticParams { }; struct ConcatenationParams { - int8_t axis; + int32_t axis; const int32_t* input_zeropoint; const float* input_scale; - uint16_t inputs_count; + int32_t inputs_count; int32_t output_zeropoint; float output_scale; }; @@ -999,7 +1031,7 @@ struct TanhParams { int input_left_shift; }; -constexpr int kTransposeMaxDimensions = 6; +constexpr int kTransposeMaxDimensions = 8; struct TransposeParams { int8_t perm_count; @@ -1075,6 +1107,12 @@ inline void GetActivationParams(const P& params, float* min, float* max) { *max = params.float_activation_max; } +template +inline void GetActivationParams(const P& params, half* min, half* max) { + *min = static_cast(params.float_activation_min); + *max = static_cast(params.float_activation_max); +} + template inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) { *min = params.int64_activation_min; diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc index 62feffc1c0a..ae888896fba 100644 --- a/tensorflow/lite/kernels/kernel_util.cc +++ b/tensorflow/lite/kernels/kernel_util.cc @@ -25,6 +25,7 @@ limitations under the License. #ifndef TF_LITE_STATIC_MEMORY #include +#include "absl/types/span.h" #include "tensorflow/lite/array.h" #endif // TF_LITE_STATIC_MEMORY @@ -33,6 +34,7 @@ limitations under the License. #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/util.h" #if defined(__APPLE__) #include "TargetConditionals.h" @@ -101,9 +103,8 @@ inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context, const TfLiteNode* node, int index, const TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK( - context, ValidateTensorIndexingSafe(context, index, node->inputs->size, - node->inputs->data, &tensor_index)); + TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe( + context, index, node->inputs->size, node->inputs->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; } @@ -140,9 +141,8 @@ TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node, TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK( - context, ValidateTensorIndexingSafe(context, index, node->outputs->size, - node->outputs->data, &tensor_index)); + TF_LITE_ENSURE_STATUS(ValidateTensorIndexingSafe( + context, index, node->outputs->size, node->outputs->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; } @@ -167,8 +167,8 @@ TfLiteStatus GetTemporarySafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe( - context, index, node->temporaries->size, + TF_LITE_ENSURE_STATUS( + ValidateTensorIndexingSafe(context, index, node->temporaries->size, node->temporaries->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; @@ -188,8 +188,8 @@ TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context, const TfLiteNode* node, int index, TfLiteTensor** tensor) { int tensor_index; - TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe( - context, index, node->intermediates->size, + TF_LITE_ENSURE_STATUS( + ValidateTensorIndexingSafe(context, index, node->intermediates->size, node->intermediates->data, &tensor_index)); *tensor = GetTensorAtIndex(context, tensor_index); return kTfLiteOk; @@ -546,6 +546,8 @@ int TfLiteTypeGetSizeBits(TfLiteType type) { return 4; case kTfLiteUInt8: case kTfLiteInt8: + case kTfLiteFloat8E4M3FN: + case kTfLiteFloat8E5M2: return 8; case kTfLiteUInt16: case kTfLiteInt16: @@ -595,4 +597,25 @@ bool HasUnspecifiedDimension(const TfLiteTensor* tensor) { return false; } +TfLiteStatus CheckedShapeProduct(TfLiteContext* context, + absl::Span dims, + const char* error_message, size_t& product) { + // The CheckedNumElements function already checks for negative dimensions, so + // we don't do it here. + TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, + "%s", error_message); + return kTfLiteOk; +} + +TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, + absl::Span dims, + const char* error_message, int& product) { + for (const int dim : dims) { + TF_LITE_ENSURE_MSG(context, dim >= 0, "Encountered a negative dimension."); + } + TF_LITE_ENSURE_MSG(context, CheckedNumElements(dims, product) == kTfLiteOk, + "%s", error_message); + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index 25e5386ccb6..6b649cc8e9b 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -17,11 +17,13 @@ limitations under the License. #include +#include #include #ifndef TF_LITE_STATIC_MEMORY #include #endif // TF_LITE_STATIC_MEMORY +#include "absl/types/span.h" #include "tensorflow/lite/core/c/builtin_op_data.h" #include "tensorflow/lite/core/c/common.h" #ifndef NDEBUG @@ -341,6 +343,30 @@ bool IsMobilePlatform(); // Returns whether there is unspecified dimension in the tensor's dim signature. bool HasUnspecifiedDimension(const TfLiteTensor* tensor); +/** + * Calculates the product of the given dimensions. Returns an error if any of + * the dimensions is negative or if the product overflows. + * @param context The context to use for error reporting. + * @param dims The dimensions to multiply. + * @param error_message The error message to use if an error is encountered. + * @param product The output parameter to store the product. + */ +TfLiteStatus CheckedShapeProduct(TfLiteContext* context, + absl::Span dims, + const char* error_message, size_t& product); + +/** + * Calculates the product of the given dimensions. Returns an error if any of + * the dimensions is negative or if the product overflows. + * @param context The context to use for error reporting. + * @param dims The dimensions to multiply. + * @param error_message The error message to use if an error is encountered. + * @param product The output parameter to store the product. + */ +TfLiteStatus CheckedShapeProductToInt(TfLiteContext* context, + absl::Span dims, + const char* error_message, int& product); + } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/tensorflow/lite/python/schema_py_generated.py b/tensorflow/lite/python/schema_py_generated.py index 083034df03f..6e690f431e4 100755 --- a/tensorflow/lite/python/schema_py_generated.py +++ b/tensorflow/lite/python/schema_py_generated.py @@ -28,6 +28,8 @@ class TensorType(object): BFLOAT16 = 18 INT2 = 19 UINT4 = 20 + FLOAT8_E4M3FN = 21 + FLOAT8_E5M2 = 22 class QuantizationDetails(object): diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index 210cd40a2d3..67cafaca66c 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -713,11 +713,13 @@ enum TensorType : int8_t { TensorType_BFLOAT16 = 18, TensorType_INT2 = 19, TensorType_UINT4 = 20, + TensorType_FLOAT8_E4M3FN = 21, + TensorType_FLOAT8_E5M2 = 22, TensorType_MIN = TensorType_FLOAT32, - TensorType_MAX = TensorType_UINT4 + TensorType_MAX = TensorType_FLOAT8_E5M2 }; -inline const TensorType (&EnumValuesTensorType())[21] { +inline const TensorType (&EnumValuesTensorType())[23] { static const TensorType values[] = { TensorType_FLOAT32, TensorType_FLOAT16, @@ -739,13 +741,15 @@ inline const TensorType (&EnumValuesTensorType())[21] { TensorType_INT4, TensorType_BFLOAT16, TensorType_INT2, - TensorType_UINT4 + TensorType_UINT4, + TensorType_FLOAT8_E4M3FN, + TensorType_FLOAT8_E5M2 }; return values; } inline const char * const *EnumNamesTensorType() { - static const char * const names[22] = { + static const char * const names[24] = { "FLOAT32", "FLOAT16", "INT32", @@ -767,13 +771,15 @@ inline const char * const *EnumNamesTensorType() { "BFLOAT16", "INT2", "UINT4", + "FLOAT8_E4M3FN", + "FLOAT8_E5M2", nullptr }; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_UINT4)) return ""; + if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_FLOAT8_E5M2)) return ""; const size_t index = static_cast(e); return EnumNamesTensorType()[index]; } diff --git a/tensorflow/lite/tools/flatbuffer_utils_test.py b/tensorflow/lite/tools/flatbuffer_utils_test.py index 13074aaca5e..e8a2e46b9be 100644 --- a/tensorflow/lite/tools/flatbuffer_utils_test.py +++ b/tensorflow/lite/tools/flatbuffer_utils_test.py @@ -18,9 +18,9 @@ import subprocess import sys -from tflite_micro.tensorflow.lite.python import schema_py_generated as schema # pylint:disable=g-direct-tensorflow-import -from tflite_micro.tensorflow.lite.tools import flatbuffer_utils -from tflite_micro.tensorflow.lite.tools import test_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.python import schema_py_generated as schema # pylint:disable=g-direct-tensorflow-import +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import flatbuffer_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import test_utils from tensorflow.python.framework import test_util from tensorflow.python.platform import test diff --git a/tensorflow/lite/tools/test_utils.py b/tensorflow/lite/tools/test_utils.py index 44157143d5d..582fbd2879b 100644 --- a/tensorflow/lite/tools/test_utils.py +++ b/tensorflow/lite/tools/test_utils.py @@ -18,7 +18,7 @@ """ import flatbuffers -from tflite_micro.tensorflow.lite.python import schema_py_generated as schema_fb +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.python import schema_py_generated as schema_fb TFLITE_SCHEMA_VERSION = 3 diff --git a/tensorflow/lite/tools/visualize_test.py b/tensorflow/lite/tools/visualize_test.py index 68de38cc9d7..4cbb01f2b58 100644 --- a/tensorflow/lite/tools/visualize_test.py +++ b/tensorflow/lite/tools/visualize_test.py @@ -16,8 +16,8 @@ import os import re -from tflite_micro.tensorflow.lite.tools import test_utils -from tflite_micro.tensorflow.lite.tools import visualize +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import test_utils +from tflite_micro.tensorflow.lite_micro.tensorflow.lite.tools import visualize from tensorflow.python.framework import test_util from tensorflow.python.platform import test