diff --git a/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc new file mode 100644 index 0000000000000..75d7c9998f3da --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc @@ -0,0 +1,225 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include + +#include "core/optimizer/initializer.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +// ONNX GatherND(data, indices) maps to the CoreML ML Program 'gather_nd' op. +class GatherNDOpBuilder : public BaseOpBuilder { + void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; + + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status GatherNDOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const { + using namespace CoreML::Specification::MILSpec; + const auto& input_defs = node.InputDefs(); + const auto& output_defs = node.OutputDefs(); + + // CoreML's gather_nd does not accept a bool 'x'. 
Transformer attention-mask + // graphs gather from bool tensors, so for that case the op is composed as + // cast(bool -> int32) -> gather_nd -> cast(int32 -> bool). int32 represents + // 0/1 exactly, so the round-trip is lossless. + int32_t data_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; + GetType(*input_defs[0], data_type, logger); + const bool data_is_bool = data_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL; + + std::string_view gather_x_name = input_defs[0]->Name(); + if (data_is_bool) { + std::vector x_shape; + const bool has_x_shape = GetShape(*input_defs[0], x_shape, logger); + const std::string& cast_x_name = model_builder.GetUniqueName(node, "gather_nd_x_int32"); + std::unique_ptr cast_in = model_builder.CreateOperation(node, "cast"); + AddOperationInput(*cast_in, "x", input_defs[0]->Name()); + AddOperationInput(*cast_in, "dtype", + model_builder.AddScalarConstant(cast_in->type(), "dtype", std::string("int32"))); + AddIntermediateOperationOutput(*cast_in, cast_x_name, ONNX_NAMESPACE::TensorProto_DataType_INT32, + has_x_shape ? std::optional>(x_shape) + : std::nullopt); + model_builder.AddOperation(std::move(cast_in)); + gather_x_name = cast_x_name; + } + + // ONNX GatherND permits negative indices (wrapped by the corresponding data dim); CoreML's gather_nd + // does not. The indices are a constant and the indexed data dims are static (both gated in + // IsOpSupportedImpl), so wrap any negatives now and re-emit them as an int32 'indices' constant. The + // original initializer is skipped (see AddInitializersToSkip). 
+ std::string indices_name; + { + std::vector data_shape, indices_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], data_shape, logger) && + GetShape(*input_defs[1], indices_shape, logger) && !indices_shape.empty(), + "GatherND: failed to get data/indices shape"); + const size_t depth = static_cast(indices_shape.back()); + const Initializer unpacked(*model_builder.GetConstantInitializer(input_defs[1]->Name())); + int32_t indices_type = ONNX_NAMESPACE::TensorProto_DataType_INT64; + GetType(*input_defs[1], indices_type, logger); + + std::vector normalized; + const auto wrap = [&](auto src) { + normalized.reserve(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + int64_t v = static_cast(src[i]); + if (v < 0) v += data_shape[i % depth]; + normalized.push_back(v); + } + }; + if (indices_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { + wrap(unpacked.DataAsSpan()); + } else { + wrap(unpacked.DataAsSpan()); + } + // AddConstant with int64 values emits an int32 'const' (CoreML uses int32 indices). + indices_name = model_builder.AddConstant(node.OpType(), "indices", normalized, + gsl::span(indices_shape)); + } + + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather.gather_nd + // The iOS15 gather_nd has no batch_dims parameter and is equivalent to ONNX + // GatherND with batch_dims == 0 (other values are gated in IsOpSupportedImpl). + std::unique_ptr op = model_builder.CreateOperation(node, "gather_nd"); + AddOperationInput(*op, "x", gather_x_name); + AddOperationInput(*op, "indices", indices_name); + // CoreML docs mark validate_indices as optional, but the ML Program parser + // rejects gather_nd without it (same as the 'gather' op builder). 
+ AddOperationInput(*op, "validate_indices", + model_builder.AddScalarConstant(op->type(), "validate_indices", false)); + + if (!data_is_bool) { + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + return Status::OK(); + } + + // Cast the int32 gather_nd result back to bool to match the ONNX output type. + std::vector out_shape; + const bool has_out_shape = GetShape(*output_defs[0], out_shape, logger); + const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_nd_out_int32"); + AddIntermediateOperationOutput(*op, gather_out_name, ONNX_NAMESPACE::TensorProto_DataType_INT32, + has_out_shape ? std::optional>(out_shape) + : std::nullopt); + model_builder.AddOperation(std::move(op)); + + std::unique_ptr cast_out = model_builder.CreateOperation(node, "cast"); + AddOperationInput(*cast_out, "x", gather_out_name); + AddOperationInput(*cast_out, "dtype", + model_builder.AddScalarConstant(cast_out->type(), "dtype", std::string("bool"))); + AddOperationOutput(*cast_out, *output_defs[0]); + model_builder.AddOperation(std::move(cast_out)); + return Status::OK(); +} + +bool GatherNDOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "GatherND is only supported for the ML Program format."; + return false; + } + + // The iOS15 gather_nd op has no batch_dims parameter, so only batch_dims == 0 + // (the ONNX default) maps directly. + NodeAttrHelper helper(node); + const auto batch_dims = helper.Get("batch_dims", int64_t{0}); + if (batch_dims != 0) { + LOGS(logger, VERBOSE) << "GatherND only supports batch_dims == 0. Got: " << batch_dims; + return false; + } + + // CoreML's gather_nd miscomputes the result for some data/indices shape combinations when 'indices' + // is a non-constant (runtime) input -- it returns slice 0 regardless of the actual index value. 
With + // a constant 'indices' the op is correct (verified on-device), and constant indices is the common case + // (e.g. transformer attention-mask gathers). Require a constant 'indices' so we never silently emit + // wrong results; non-constant cases fall back to CPU. + if (!input_params.graph_viewer.IsConstantInitializer(node.InputDefs()[1]->Name(), /*check_outer_scope*/ true)) { + LOGS(logger, VERBOSE) << "GatherND: 'indices' must be a constant initializer for the CoreML EP."; + return false; + } + + // Negative indices are normalized to positive at build time (AddToModelBuilderImpl), which needs the + // indexed data dims -- the first indices.shape[-1] dims -- to be statically known. + std::vector data_shape, indices_shape; + if (!GetShape(*node.InputDefs()[0], data_shape, logger) || + !GetShape(*node.InputDefs()[1], indices_shape, logger) || indices_shape.empty()) { + LOGS(logger, VERBOSE) << "GatherND: data or indices shape is unknown."; + return false; + } + const size_t depth = static_cast(indices_shape.back()); + if (depth > data_shape.size()) { + LOGS(logger, VERBOSE) << "GatherND: index tuple depth " << depth << " exceeds data rank " << data_shape.size(); + return false; + } + for (size_t k = 0; k < depth; ++k) { + if (data_shape[k] < 0) { + LOGS(logger, VERBOSE) << "GatherND: indexed data dims must be static."; + return false; + } + } + + return true; +} + +void GatherNDOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + // 'indices' is re-emitted as a normalized int32 constant in AddToModelBuilderImpl, so skip the original. 
+ model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); +} + +bool GatherNDOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& /*input_params*/, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + int32_t data_type = 0, indices_type = 0; + if (!GetType(*input_defs[0], data_type, logger) || !GetType(*input_defs[1], indices_type, logger)) { + return false; + } + + // gather_nd itself is type-agnostic over 'x' but rejects bool; bool 'data' + // (transformer mask graphs) is supported via a cast round-trip in + // AddToModelBuilderImpl. INT64 'data' is accepted because the CoreML EP + // implicitly narrows int64 to int32 at the model boundary (the int64->int32 + // input conversion in model.mm and the matching INT32 feature/output handling + // in ModelBuilder::RegisterModelInputOutput), so CoreML never sees int64. + if (data_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + data_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && + data_type != ONNX_NAMESPACE::TensorProto_DataType_INT32 && + data_type != ONNX_NAMESPACE::TensorProto_DataType_INT64 && + data_type != ONNX_NAMESPACE::TensorProto_DataType_BOOL) { + LOGS(logger, VERBOSE) << "GatherND: 'data' input type not supported. Got type: " << data_type; + return false; + } + + // ONNX GatherND indices are int64; the CoreML EP converts int64 <-> int32. + if (indices_type != ONNX_NAMESPACE::TensorProto_DataType_INT64 && + indices_type != ONNX_NAMESPACE::TensorProto_DataType_INT32) { + LOGS(logger, VERBOSE) << "GatherND: 'indices' input must be int32 or int64. 
Got type: " << indices_type; + return false; + } + return true; +} + +void CreateGatherNDOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index 658a90a8d3eb0..3b8287d7d2b50 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -917,6 +917,12 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i AddInt64Output(name); } break; + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + // ArrayFeatureType has no bool, so (like int64) the external feature is INT32. The int32<->bool + // cast at the ML Program boundary is wired up below / in RewriteBoolGraphIOBoundaries(), and the + // runtime int32<->bool data conversion is handled in model.mm. + multi_array->set_datatype(ArrayFeatureType::INT32); + break; default: { // TODO: support other type return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, @@ -932,22 +938,123 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i return Status::OK(); } + const bool is_bool = data_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL; + if (create_ml_program_) { if (is_input) { // the model inputs need to be wired up as args to the 'main' function. auto tensor_value_type = CreateNamedTensorValueType(node_arg, /*convert_scalar*/ true); - // Handle conversion from int64 to int32 + // Handle conversion from int64 to int32. A bool feature is exposed as int32 too, so the function + // arg is int32; the int32->bool cast is inserted immediately below so the op builders see bool. 
tensor_value_type.mutable_type()->mutable_tensortype()->set_datatype( - OnnxDataTypeToMILSpec(data_type)); + OnnxDataTypeToMILSpec(is_bool ? ONNX_NAMESPACE::TensorProto_DataType_INT32 : data_type)); tensor_value_type.set_name(name); mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type)); + + if (is_bool) { + // Emit the int32->bool cast now (ahead of any consumer in the block). Consumers still reference + // `name`; RewriteBoolGraphIOBoundaries() repoints them at the bool value once they've been added. + const std::string bool_name = GetUniqueName(name + "_to_bool"); + AddBoundaryCastOp(name, bool_name, ONNX_NAMESPACE::TensorProto_DataType_BOOL, shape); + bool_input_value_rename_[name] = bool_name; + } } else { // the model outputs need to be set as outputs of the Block for the 'main' function *mlprogram_main_block_->mutable_outputs()->Add() = name; + + if (is_bool) { + // The op builders produce a bool value named `name`; RewriteBoolGraphIOBoundaries() inserts a + // bool->int32 cast so the int32 feature/block-output `name` is satisfied. + bool_graph_outputs_.emplace_back(name, shape); + } + } + } + + return Status::OK(); +} + +void ModelBuilder::AddBoundaryCastOp(std::string_view input_value_name, std::string_view output_value_name, + int32_t output_onnx_type, gsl::span shape) { + auto op = std::make_unique(); + op->set_type("cast"); + (*op->mutable_attributes())["name"] = + CreateScalarTensorValue(GetUniqueName(MakeString("boundary_cast_", output_value_name))); + + AddOperationInput(*op, "x", input_value_name); + const std::string mil_dtype = + output_onnx_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL ? 
"bool" : "int32"; + AddOperationInput(*op, "dtype", AddScalarConstant(op->type(), "dtype", mil_dtype)); + AddIntermediateOperationOutput(*op, output_value_name, output_onnx_type, shape); + + AddOperation(std::move(op)); +} + +Status ModelBuilder::RewriteBoolGraphIOBoundaries() { + if (bool_input_value_rename_.empty() && bool_graph_outputs_.empty()) { + return Status::OK(); + } + + // bool graph inputs: the int32->bool cast was already emitted (ahead of consumers) in + // RegisterModelInputOutput. Repoint each consumer at the bool value. The cast ops themselves + // legitimately reference the original int32 input, so skip any op whose output is a rename target. + if (!bool_input_value_rename_.empty()) { + std::unordered_set cast_outputs; + for (const auto& [orig, bool_name] : bool_input_value_rename_) { + cast_outputs.insert(bool_name); + } + for (auto& op : *mlprogram_main_block_->mutable_operations()) { + bool is_boundary_cast = false; + for (const auto& out : op.outputs()) { + if (Contains(cast_outputs, out.name())) { + is_boundary_cast = true; + break; + } + } + if (is_boundary_cast) { + continue; + } + for (auto& input : *op.mutable_inputs()) { + for (auto& arg : *input.second.mutable_arguments()) { + auto it = bool_input_value_rename_.find(arg.name()); + if (it != bool_input_value_rename_.end()) { + arg.set_name(it->second); + } + } + } + } + } + + // bool graph outputs: the op builders produced a bool value named `name`. Rename that producer's output + // (and any internal consumers) to a bool intermediate, then append a bool->int32 cast producing the + // int32 feature/block-output `name`. 
+ for (const auto& [name, shape] : bool_graph_outputs_) { + const std::string pre_name = GetUniqueName(name + "_from_bool"); + bool found = false; + for (auto& op : *mlprogram_main_block_->mutable_operations()) { + for (auto& out : *op.mutable_outputs()) { + if (out.name() == name) { + out.set_name(pre_name); + found = true; + } + } + } + if (!found) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "RewriteBoolGraphIOBoundaries: bool graph output not produced by any operation: ", name); + } + for (auto& op : *mlprogram_main_block_->mutable_operations()) { + for (auto& input : *op.mutable_inputs()) { + for (auto& arg : *input.second.mutable_arguments()) { + if (arg.name() == name) { + arg.set_name(pre_name); + } + } + } } + AddBoundaryCastOp(pre_name, name, ONNX_NAMESPACE::TensorProto_DataType_INT32, shape); } return Status::OK(); @@ -994,6 +1101,7 @@ Status ModelBuilder::CreateModel() { ORT_RETURN_IF_ERROR(RegisterModelOutputs()); if (create_ml_program_) { + ORT_RETURN_IF_ERROR(RewriteBoolGraphIOBoundaries()); SanitizeNames(); } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index f3012e8137e8c..1430a98b5c5c6 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -199,6 +199,21 @@ class ModelBuilder { Status RegisterModelOutputs(); Status RegisterModelInputOutput(const NodeArg& node_arg, bool is_input); + // CoreML's ArrayFeatureType (the external model IO representation) has no bool, so a bool graph + // input/output is exposed as an INT32 feature, mirroring the int64 handling. Inside the ML Program + // the op builders still operate on bool tensors, so the boundary needs int32<->bool cast ops: + // - bool graph input: cast(int32 feature) -> bool, then consumers reference the bool value. + // - bool graph output: cast(internal bool) -> int32, which becomes the int32 feature. 
+ // The input casts are emitted in RegisterModelInputOutput; RewriteBoolGraphIOBoundaries() runs after the op + // builders to repoint input consumers and append the output casts, so the builders stay unaware of the + // boundary representation. The int32<->bool data conversion happens at runtime in model.mm, mirroring int64. + Status RewriteBoolGraphIOBoundaries(); + + // Append a 'cast' op (input_value_name -> output_value_name with the given ONNX output type) to the + // main block. Used by RegisterModelInputOutput (bool inputs) and RewriteBoolGraphIOBoundaries (bool outputs). + void AddBoundaryCastOp(std::string_view input_value_name, std::string_view output_value_name, + int32_t output_onnx_type, gsl::span shape); + // Record the onnx scalar output names void AddScalarOutput(const std::string& output_name); @@ -221,6 +236,14 @@ class ModelBuilder { std::unordered_set int64_outputs_; std::unordered_map input_output_info_; + // bool graph IO exposed as INT32 features (see RewriteBoolGraphIOBoundaries). + // For inputs the int32->bool cast is emitted eagerly in RegisterModelInputOutput (so it sits ahead of + // its consumers in the block); this map records original input name -> bool value name so the consumer + // references can be rewritten after the op builders have run. + std::unordered_map bool_input_value_rename_; + // For outputs the bool->int32 cast is appended after the op builders run; {name, shape} captured here. 
+ std::vector>> bool_graph_outputs_; + std::unordered_map initializer_usage_; std::unordered_set skipped_inputs_; diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 7ba8a9fe5f09c..2dccf6c6550c8 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -78,6 +78,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations); CreateFlattenOpBuilder("Flatten", op_registrations); CreateGatherOpBuilder("Gather", op_registrations); + CreateGatherNDOpBuilder("GatherND", op_registrations); CreateGemmOpBuilder("Gemm", op_registrations); CreateGridSampleOpBuilder("GridSample", op_registrations); CreateIdentityOpBuilder("Identity", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index d399a4f91576e..49b5779866677 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -29,6 +29,7 @@ void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrat void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateGatherNDOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateIdentityOpBuilder(const std::string& op_type, 
OpBuilderRegistrations& op_registrations); diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 71664021ea2fb..9d8a47ae7e5e9 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -146,6 +146,23 @@ Status CreateInputFeatureProvider(const std::unordered_map(ShapeSize(shape)); + const auto input_span = gsl::span{static_cast(onnx_tensor_data.buffer), num_elements}; + auto conversion_buffer = std::make_unique(num_elements); + const auto conversion_span = gsl::span{conversion_buffer.get(), num_elements}; + std::transform(input_span.begin(), input_span.end(), conversion_span.begin(), + [](bool v) { return v ? 1 : 0; }); + + conversion_buffers.emplace_back(std::move(conversion_buffer)); + data_pointer = conversion_buffers.back().get(); + + break; + } default: { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Output data type is not supported, actual type: ", onnx_tensor_data.tensor_info.data_type); @@ -250,6 +267,26 @@ Status CopyMLMultiArrayBuffer(const void* mlmultiarray_buffer, void* tensor_buff } break; } + // CoreML has no bool MLMultiArray; a bool output is produced as int32 (see the model builder's + // RewriteBoolGraphIOBoundaries) and converted back to bool here. 
+ case ONNX_NAMESPACE::TensorProto_DataType_BOOL: { + ORT_RETURN_IF(array.dataType != MLMultiArrayDataTypeInt32, + "CoreML output data type is not MLMultiArrayDataTypeInt32"); + + const int32_t* src_buffer = static_cast(mlmultiarray_buffer); + bool* dst_buffer = static_cast(tensor_buffer); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + auto input_span = gsl::span{src_buffer, static_cast(block_size)}; + auto output_span = gsl::span{dst_buffer, static_cast(block_size)}; + std::transform(input_span.begin(), input_span.end(), output_span.begin(), + [](int32_t v) { return v != 0; }); + + src_buffer += stride; + dst_buffer += block_size; + } + break; + } default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Output data type is not supported, actual type: ", onnx_data_type); diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 77f43b60dd6f8..7a20c73da49fb 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -282,6 +282,55 @@ TEST(CoreMLExecutionProviderTest, ShapeThenSliceAndGather) { #endif } +// GatherND on the ML Program path is only claimed when 'indices' is a constant initializer +// (see GatherNDOpBuilder::IsOpSupportedImpl -- CoreML's gather_nd miscomputes some shapes with a +// runtime indices input). This is the supported path: a multi-dimensional slice gather (index depth 1 +// on rank-3 data) with constant indices must run on CoreML and match the CPU result. 
+TEST(CoreMLExecutionProviderTest, GatherNDConstantIndicesMLProgram) { + std::unordered_map domain_to_version{{kOnnxDomain, 13}}; + onnxruntime::Model model("gnd_const", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + auto make_type = [](int32_t et, std::vector dims) { + ONNX_NAMESPACE::TypeProto t; + t.mutable_tensor_type()->set_elem_type(et); + for (auto d : dims) t.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(d); + return t; + }; + const auto data_t = make_type(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 2, 2}); + const auto out_t = make_type(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1, 2, 2}); + auto& data = graph.GetOrCreateNodeArg("data", &data_t); + auto& out = graph.GetOrCreateNodeArg("Y", &out_t); + ONNX_NAMESPACE::TensorProto idx; + idx.set_name("indices"); + idx.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + idx.add_dims(2); + idx.add_dims(1); + idx.add_dims(1); + idx.add_int64_data(1); + idx.add_int64_data(0); + graph.AddInitializedTensor(idx); + auto& idx_arg = graph.GetOrCreateNodeArg("indices", nullptr); + graph.AddNode("gnd", "GatherND", "", {&data, &idx_arg}, {&out}); + ORT_THROW_IF_ERROR(graph.Resolve()); + std::string md; + model.ToProto().SerializeToString(&md); + gsl::span span{reinterpret_cast(md.data()), md.size()}; +#if defined(__APPLE__) + std::vector dims = {2, 2, 2}; + std::vector vals = {0, 1, 2, 3, 4, 5, 6, 7}; + OrtValue dv; + CreateMLValue(CPUAllocator::DefaultInstance(), dims, vals, &dv); + NameMLValMap feeds; + feeds.insert(std::make_pair("data", dv)); + RunAndVerifyOutputsWithEP(span, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#endif +} + #endif // !(ORT_MINIMAL_BUILD) TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { @@ -2450,6 +2499,130 @@ 
TEST(CoreMLExecutionProviderTest, CastBoolMLProgramPartition) { TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); } +namespace { +ONNX_NAMESPACE::TypeProto MakeTensorType(int32_t elem_type, const std::vector& shape) { + ONNX_NAMESPACE::TypeProto t; + t.mutable_tensor_type()->set_elem_type(elem_type); + for (int64_t d : shape) t.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(d); + return t; +} + +// Constant int64 indices initializer {{0},{2}} (shape [2,1]). +void AddGatherNDIndices(onnxruntime::Graph& graph) { + ONNX_NAMESPACE::TensorProto indices; + indices.set_name("indices"); + indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices.add_dims(2); + indices.add_dims(1); + for (int64_t v : {0, 2}) indices.add_int64_data(v); + graph.AddInitializedTensor(indices); +} + +// GatherND(data[4,3] float input, indices[2,1] const) -> out[2,3] float. +std::string MakeGatherNDModelData() { + onnxruntime::Model model("gather_nd_test", false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + const auto float_data = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {4, 3}); + const auto indices_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1}); + const auto float_out = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3}); + + auto& data = graph.GetOrCreateNodeArg("data", &float_data); + auto& indices = graph.GetOrCreateNodeArg("indices", &indices_type); + auto& out = graph.GetOrCreateNodeArg("Out", &float_out); + AddGatherNDIndices(graph); + graph.AddNode("gather_nd", "GatherND", "gather rows", {&data, &indices}, {&out}); + + ORT_THROW_IF_ERROR(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + return model_data; +} + +// data(int32 input) -> Cast(bool) -> GatherND -> Cast(float). 
Exercises the +// bool-data path, which the builder lowers as cast -> gather_nd -> cast (the +// bool tensors stay internal to the CoreML partition). +std::string MakeGatherNDBoolModelData() { + onnxruntime::Model model("gather_nd_bool_test", false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + const auto int32_data = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT32, {4, 3}); + const auto bool_data = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_BOOL, {4, 3}); + const auto indices_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1}); + const auto bool_out = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_BOOL, {2, 3}); + const auto float_out = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3}); + + auto& src = graph.GetOrCreateNodeArg("Src", &int32_data); + auto& data = graph.GetOrCreateNodeArg("data", &bool_data); + auto& indices = graph.GetOrCreateNodeArg("indices", &indices_type); + auto& gathered = graph.GetOrCreateNodeArg("gathered", &bool_out); + auto& out = graph.GetOrCreateNodeArg("Out", &float_out); + AddGatherNDIndices(graph); + + auto& to_bool = graph.AddNode("cast_to_bool", "Cast", "int32 -> bool", {&src}, {&data}); + to_bool.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BOOL)); + graph.AddNode("gather_nd", "GatherND", "gather bool rows", {&data, &indices}, {&gathered}); + auto& to_float = graph.AddNode("cast_to_float", "Cast", "bool -> float", {&gathered}, {&out}); + to_float.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); + + ORT_THROW_IF_ERROR(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + return model_data; +} + +// GatherND with batch_dims=1: data[2,3] input, indices[2,1] const -> out[2]. 
+std::string MakeGatherNDBatchDimsModelData() { + onnxruntime::Model model("gather_nd_batchdims_test", false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + const auto float_data = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3}); + const auto indices_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1}); + const auto float_out = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2}); + + auto& data = graph.GetOrCreateNodeArg("data", &float_data); + auto& indices = graph.GetOrCreateNodeArg("indices", &indices_type); + auto& out = graph.GetOrCreateNodeArg("Out", &float_out); + + ONNX_NAMESPACE::TensorProto indices_init; + indices_init.set_name("indices"); + indices_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices_init.add_dims(2); + indices_init.add_dims(1); + for (int64_t v : {0, 1}) indices_init.add_int64_data(v); + graph.AddInitializedTensor(indices_init); + + auto& node = graph.AddNode("gather_nd", "GatherND", "batched gather", {&data, &indices}, {&out}); + node.AddAttribute("batch_dims", static_cast(1)); + + ORT_THROW_IF_ERROR(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + return model_data; +} +} // namespace + +// GatherND is lowered to the ML Program 'gather_nd' op. 
+TEST(CoreMLExecutionProviderTest, GatherND_MLProgram) { + const std::string model_data = MakeGatherNDModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {4, 3}; + std::vector values = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + OrtValue data_val; + CreateMLValue(CPUAllocator::DefaultInstance(), dims, values, &data_val); + NameMLValMap feeds; + feeds.insert(std::make_pair("data", data_val)); + + EPVerificationParams params{}; + params.ep_node_assignment = ExpectedEPNodeAssignment::All; + RunAndVerifyOutputsWithEP(model_span, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram"), feeds, params); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + // Sin and Cos are lowered to the ML Program 'sin' / 'cos' ops. TEST(CoreMLExecutionProviderTest, SinCos_MLProgram) { const std::string model_data = MakeSinCosModelData(); @@ -2550,6 +2723,31 @@ TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis1) { #endif } +// CoreML's gather_nd rejects bool 'x', so the builder lowers a bool-data +// GatherND as cast(bool->int32) -> gather_nd -> cast(int32->bool). This +// Cast->GatherND->Cast chain must run fully on CoreML. 
+TEST(CoreMLExecutionProviderTest, GatherNDBoolData_MLProgram) { + const std::string model_data = MakeGatherNDBoolModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {4, 3}; + std::vector values = {0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1}; + OrtValue src_val; + CreateMLValue(CPUAllocator::DefaultInstance(), dims, values, &src_val); + NameMLValMap feeds; + feeds.insert(std::make_pair("Src", src_val)); + + EPVerificationParams params{}; + params.ep_node_assignment = ExpectedEPNodeAssignment::All; + RunAndVerifyOutputsWithEP(model_span, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram"), feeds, params); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + // Sin/Cos only have an ML Program lowering (the NeuralNetwork // UnaryFunctionLayerParams has no sin/cos), so on the NeuralNetwork format // they must fall back to CPU rather than be claimed. @@ -2630,6 +2828,24 @@ TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis0) { #endif } +// GatherND only has an ML Program lowering; on the NeuralNetwork format it +// must fall back to CPU. +TEST(CoreMLExecutionProviderTest, GatherNDNeuralNetworkNotSupported) { + const std::string model_data = MakeGatherNDModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); +} + +// The iOS15 gather_nd op has no batch_dims parameter, so GatherND with +// batch_dims != 0 must fall back to CPU. 
+TEST(CoreMLExecutionProviderTest, GatherNDBatchDimsNotSupported) { + const std::string model_data = MakeGatherNDBatchDimsModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); +} + TEST(CoreMLExecutionProviderTest, GatherScalarIndicesNegativeAxis) { // Scalar Gather with negative axis (-1) — verifies HandleNegativeAxis is // applied when computing the squeeze axis. diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 0365da55bd48a..d889fb4972878 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -18,6 +18,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Elu|| |ai.onnx:Erf|| |ai.onnx:Exp|| +|ai.onnx:GatherND|batch_dims must be 0. Input indices must be constant.| |ai.onnx:Gemm|Input B must be constant.| |ai.onnx:Gelu|| |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|