diff --git a/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc
new file mode 100644
index 0000000000000..75d7c9998f3da
--- /dev/null
+++ b/onnxruntime/core/providers/coreml/builders/impl/gather_nd_op_builder.cc
@@ -0,0 +1,225 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <optional>
+#include <vector>
+
+#include "core/optimizer/initializer.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace coreml {
+
+// Maps ONNX GatherND(data, indices) to the CoreML ML Program 'gather_nd' op; only constant 'indices' are accepted.
+class GatherNDOpBuilder : public BaseOpBuilder {
+  void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
+  // Emits gather_nd (wrapped in int32 casts for bool 'data') plus a re-emitted, wrapped 'indices' constant.
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
+  // Requires ML Program, batch_dims == 0, a constant 'indices' initializer and static indexed data dims.
+  bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                         const logging::Logger& logger) const override;
+  // Accepts float/float16/int32/int64/bool 'data' and int32/int64 'indices'.
+  bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
+                              const logging::Logger& logger) const override;
+  // Always true; IsOpSupportedImpl additionally rejects the non-ML-Program format.
+  bool SupportsMLProgram() const override { return true; }
+};
+
+Status GatherNDOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                                                const logging::Logger& logger) const {
+  // Emits [cast ->] gather_nd [-> cast] for a GatherND node that already passed IsOpSupportedImpl and
+  // HasSupportedInputsImpl. Returns a failed Status when the shapes or the constant 'indices' are unusable.
+  using namespace CoreML::Specification::MILSpec;
+  const auto& input_defs = node.InputDefs();
+  const auto& output_defs = node.OutputDefs();
+
+  // CoreML's gather_nd does not accept a bool 'x'. Transformer attention-mask graphs gather from bool
+  // tensors, so for that case the op is composed as cast(bool -> int32) -> gather_nd -> cast(int32 -> bool).
+  // int32 represents 0/1 exactly, so the round-trip is lossless.
+  int32_t data_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+  GetType(*input_defs[0], data_type, logger);
+  const bool data_is_bool = data_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL;
+
+  std::string gather_x_name = input_defs[0]->Name();  // owned copy: read after the 'if' scope below closes
+  if (data_is_bool) {
+    std::vector<int64_t> x_shape;
+    const bool has_x_shape = GetShape(*input_defs[0], x_shape, logger);
+    const std::string& cast_x_name = model_builder.GetUniqueName(node, "gather_nd_x_int32");
+    std::unique_ptr<Operation> cast_in = model_builder.CreateOperation(node, "cast");
+    AddOperationInput(*cast_in, "x", input_defs[0]->Name());
+    AddOperationInput(*cast_in, "dtype",
+                      model_builder.AddScalarConstant(cast_in->type(), "dtype", std::string("int32")));
+    AddIntermediateOperationOutput(*cast_in, cast_x_name, ONNX_NAMESPACE::TensorProto_DataType_INT32,
+                                   has_x_shape ? std::optional<gsl::span<const int64_t>>(x_shape) : std::nullopt);
+    model_builder.AddOperation(std::move(cast_in));
+    gather_x_name = cast_x_name;
+  }
+
+  // ONNX GatherND permits negative indices (wrapped by the corresponding data dim); CoreML's gather_nd does not.
+  // 'indices' is a constant and the indexed data dims are static (both gated in IsOpSupportedImpl), so negatives
+  // are wrapped now and the result is re-emitted as an int32 constant (the original initializer is skipped, see
+  // AddInitializersToSkip). gather_nd runs with validate_indices == false, so out-of-range values are rejected here.
+  std::vector<int64_t> data_shape, indices_shape;
+  ORT_RETURN_IF_NOT(GetShape(*input_defs[0], data_shape, logger) &&
+                        GetShape(*input_defs[1], indices_shape, logger) && !indices_shape.empty(),
+                    "GatherND: failed to get data/indices shape");
+  const size_t depth = static_cast<size_t>(indices_shape.back());  // <= rank(data), gated in IsOpSupportedImpl
+  const auto* indices_tensor = model_builder.GetConstantInitializer(input_defs[1]->Name());
+  ORT_RETURN_IF_NOT(indices_tensor != nullptr, "GatherND: 'indices' must be a constant initializer");
+  const Initializer unpacked(*indices_tensor);
+  int32_t indices_type = ONNX_NAMESPACE::TensorProto_DataType_INT64;
+  GetType(*input_defs[1], indices_type, logger);
+  std::vector<int64_t> normalized;
+  bool indices_in_range = true;  // a lambda cannot return the Status, so the result is collected in a flag
+  const auto wrap = [&](auto src) {
+    normalized.reserve(src.size());
+    for (size_t i = 0; i < src.size(); ++i) {
+      const int64_t dim = data_shape[i % depth];  // i % depth is the position inside the index tuple
+      int64_t v = static_cast<int64_t>(src[i]);
+      if (v < 0) v += dim;
+      indices_in_range = indices_in_range && v >= 0 && v < dim;
+      normalized.push_back(v);
+    }
+  };
+  if (indices_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) {
+    wrap(unpacked.DataAsSpan<int32_t>());
+  } else {
+    wrap(unpacked.DataAsSpan<int64_t>());
+  }
+  ORT_RETURN_IF_NOT(indices_in_range, "GatherND: constant 'indices' value is out of range for 'data'");
+  // AddConstant with int64 values emits an int32 'const' (CoreML uses int32 indices).
+  const std::string indices_name(model_builder.AddConstant(node.OpType(), "indices", normalized,
+                                                           gsl::span<const int64_t>(indices_shape)));
+
+  // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather.gather_nd
+  // The iOS15 gather_nd has no batch_dims parameter and is equivalent to ONNX
+  // GatherND with batch_dims == 0 (other values are gated in IsOpSupportedImpl).
+  std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "gather_nd");
+  AddOperationInput(*op, "x", gather_x_name);
+  AddOperationInput(*op, "indices", indices_name);
+  // CoreML docs mark validate_indices as optional, but the ML Program parser
+  // rejects gather_nd without it (same as the 'gather' op builder).
+  AddOperationInput(*op, "validate_indices", model_builder.AddScalarConstant(op->type(), "validate_indices", false));
+
+  if (!data_is_bool) {
+    AddOperationOutput(*op, *output_defs[0]);
+    model_builder.AddOperation(std::move(op));
+    return Status::OK();
+  }
+
+  // Cast the int32 gather_nd result back to bool to match the ONNX output type.
+  std::vector<int64_t> out_shape;
+  const bool has_out_shape = GetShape(*output_defs[0], out_shape, logger);
+  const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_nd_out_int32");
+  AddIntermediateOperationOutput(*op, gather_out_name, ONNX_NAMESPACE::TensorProto_DataType_INT32,
+                                 has_out_shape ? std::optional<gsl::span<const int64_t>>(out_shape) : std::nullopt);
+  model_builder.AddOperation(std::move(op));
+
+  std::unique_ptr<Operation> cast_out = model_builder.CreateOperation(node, "cast");
+  AddOperationInput(*cast_out, "x", gather_out_name);
+  AddOperationInput(*cast_out, "dtype",
+                    model_builder.AddScalarConstant(cast_out->type(), "dtype", std::string("bool")));
+  AddOperationOutput(*cast_out, *output_defs[0]);
+  model_builder.AddOperation(std::move(cast_out));
+  return Status::OK();
+}
+
+bool GatherNDOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                                          const logging::Logger& logger) const {
+  // Decides whether this GatherND node can be lowered to CoreML; every rejection is logged at VERBOSE level.
+  if (!input_params.create_mlprogram) {
+    LOGS(logger, VERBOSE) << "GatherND is only supported for the ML Program format.";
+    return false;
+  }
+
+  // The iOS15 gather_nd op has no batch_dims parameter, so only batch_dims == 0 (the ONNX default) maps directly.
+  NodeAttrHelper helper(node);
+  const auto batch_dims = helper.Get("batch_dims", int64_t{0});
+  if (batch_dims != 0) {
+    LOGS(logger, VERBOSE) << "GatherND only supports batch_dims == 0. Got: " << batch_dims;
+    return false;
+  }
+
+  // CoreML's gather_nd miscomputes the result for some data/indices shape combinations when 'indices' is a
+  // runtime input (it returns slice 0 regardless of the index value). With constant 'indices' the op is correct
+  // (verified on-device), and that is the common case (e.g. transformer attention-mask gathers), so require it
+  // rather than risk silently wrong results; non-constant cases fall back to CPU.
+  if (!input_params.graph_viewer.IsConstantInitializer(node.InputDefs()[1]->Name(), /*check_outer_scope*/ true)) {
+    LOGS(logger, VERBOSE) << "GatherND: 'indices' must be a constant initializer for the CoreML EP.";
+    return false;
+  }
+
+  // Negative indices are normalized to positive at build time (AddToModelBuilderImpl), which needs the
+  // indexed data dims -- the first indices.shape[-1] dims -- to be statically known.
+  std::vector<int64_t> data_shape, indices_shape;
+  if (!GetShape(*node.InputDefs()[0], data_shape, logger) ||
+      !GetShape(*node.InputDefs()[1], indices_shape, logger) || indices_shape.empty()) {
+    LOGS(logger, VERBOSE) << "GatherND: data or indices shape is unknown.";
+    return false;
+  }
+  // ONNX requires indices.shape[-1] in [1, rank(data)]; test the signed value so 0 and negatives are rejected.
+  const int64_t last_dim = indices_shape.back();
+  if (last_dim < 1 || last_dim > static_cast<int64_t>(data_shape.size())) {
+    LOGS(logger, VERBOSE) << "GatherND: index depth must be in [1, " << data_shape.size() << "]. Got: " << last_dim;
+    return false;
+  }
+  for (size_t k = 0; k < static_cast<size_t>(last_dim); ++k) {
+    if (data_shape[k] < 0) {
+      LOGS(logger, VERBOSE) << "GatherND: indexed data dims must be static.";
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void GatherNDOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const {
+  const auto& input_defs = node.InputDefs();  // input 1 is 'indices'; AddToModelBuilderImpl re-emits it
+  model_builder.AddInitializerToSkip(input_defs[1]->Name());
+}
+
+bool GatherNDOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& /*input_params*/,
+                                               const logging::Logger& logger) const {
+  const auto& inputs = node.InputDefs();
+  int32_t x_type = 0, idx_type = 0;
+  if (!GetType(*inputs[0], x_type, logger)) return false;
+  if (!GetType(*inputs[1], idx_type, logger)) return false;
+
+  // 'data': gather_nd is type-agnostic over 'x' except for bool, which AddToModelBuilderImpl handles with a
+  // cast round-trip (transformer mask graphs). INT64 is accepted because the CoreML EP narrows int64 to
+  // int32 at the model boundary (input conversion in model.mm, INT32 feature/output handling in
+  // ModelBuilder::RegisterModelInputOutput), so CoreML never sees int64.
+  switch (x_type) {
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
+      break;
+    default:
+      LOGS(logger, VERBOSE) << "GatherND: 'data' input type not supported. Got type: " << x_type;
+      return false;
+  }
+
+  // 'indices': int64 (the ONNX type) or int32; the CoreML EP converts int64 <-> int32.
+  const bool idx_ok = idx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 ||
+                      idx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32;
+  if (!idx_ok) {
+    LOGS(logger, VERBOSE) << "GatherND: 'indices' input must be int32 or int64. Got type: " << idx_type;
+  }
+  return idx_ok;
+}
+
+void CreateGatherNDOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
+  const auto& owned = op_registrations.builders.emplace_back(std::make_unique<GatherNDOpBuilder>());
+  op_registrations.op_builder_map.emplace(op_type, owned.get());  // non-owning; `builders` holds the unique_ptr
+}
+
+}  // namespace coreml
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 658a90a8d3eb0..3b8287d7d2b50 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -917,6 +917,12 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
           AddInt64Output(name);
         }
         break;
+      case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
+        // ArrayFeatureType has no bool, so (like int64) the external feature is INT32. The int32<->bool
+        // cast at the ML Program boundary is wired up below / in RewriteBoolGraphIOBoundaries(), and the
+        // runtime int32<->bool data conversion is handled in model.mm.
+        multi_array->set_datatype(ArrayFeatureType::INT32);
+        break;
       default: {
         // TODO: support other type
         return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@@ -932,22 +938,123 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
     return Status::OK();
   }
 
+  const bool is_bool = data_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL;
+
   if (create_ml_program_) {
     if (is_input) {
       // the model inputs need to be wired up as args to the 'main' function.
       auto tensor_value_type = CreateNamedTensorValueType(node_arg, /*convert_scalar*/ true);
 
-      // Handle conversion from int64 to int32
+      // Handle conversion from int64 to int32. A bool feature is exposed as int32 too, so the function
+      // arg is int32; the int32->bool cast is inserted immediately below so the op builders see bool.
       tensor_value_type.mutable_type()->mutable_tensortype()->set_datatype(
-          OnnxDataTypeToMILSpec(data_type));
+          OnnxDataTypeToMILSpec(is_bool ? ONNX_NAMESPACE::TensorProto_DataType_INT32 : data_type));
 
       tensor_value_type.set_name(name);
 
       mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type));
+
+      if (is_bool) {
+        // Emit the int32->bool cast now (ahead of any consumer in the block). Consumers still reference
+        // `name`; RewriteBoolGraphIOBoundaries() repoints them at the bool value once they've been added.
+        const std::string bool_name = GetUniqueName(name + "_to_bool");
+        AddBoundaryCastOp(name, bool_name, ONNX_NAMESPACE::TensorProto_DataType_BOOL, shape);
+        bool_input_value_rename_[name] = bool_name;
+      }
     } else {
       // the model outputs need to be set as outputs of the Block for the 'main' function
       *mlprogram_main_block_->mutable_outputs()->Add() = name;
+
+      if (is_bool) {
+        // The op builders produce a bool value named `name`; RewriteBoolGraphIOBoundaries() inserts a
+        // bool->int32 cast so the int32 feature/block-output `name` is satisfied.
+        bool_graph_outputs_.emplace_back(name, shape);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void ModelBuilder::AddBoundaryCastOp(std::string_view input_value_name, std::string_view output_value_name,
+                                     int32_t output_onnx_type, gsl::span<const int64_t> shape) {
+  // Builds cast(x = input_value_name, dtype) -> output_value_name and hands it to AddOperation.
+  // BOOL selects the MIL dtype "bool"; every other output type is emitted as "int32".
+  const bool to_bool = output_onnx_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL;
+  const std::string target_dtype = to_bool ? "bool" : "int32";
+  auto cast_op = std::make_unique<MILSpec::Operation>();
+  cast_op->set_type("cast");
+  (*cast_op->mutable_attributes())["name"] =
+      CreateScalarTensorValue(GetUniqueName(MakeString("boundary_cast_", output_value_name)));
+  AddOperationInput(*cast_op, "x", input_value_name);
+  AddOperationInput(*cast_op, "dtype", AddScalarConstant(cast_op->type(), "dtype", target_dtype));
+  AddIntermediateOperationOutput(*cast_op, output_value_name, output_onnx_type, shape);
+  AddOperation(std::move(cast_op));
+}
+
+Status ModelBuilder::RewriteBoolGraphIOBoundaries() {
+  if (bool_input_value_rename_.empty() && bool_graph_outputs_.empty()) {
+    return Status::OK();
+  }
+
+  // bool graph inputs: the int32->bool cast was already emitted (ahead of consumers) in
+  // RegisterModelInputOutput. Repoint each consumer at the bool value. The cast ops themselves
+  // legitimately reference the original int32 input, so skip any op whose output is a rename target.
+  if (!bool_input_value_rename_.empty()) {
+    std::unordered_set<std::string> cast_outputs;
+    for (const auto& [orig, bool_name] : bool_input_value_rename_) {
+      cast_outputs.insert(bool_name);
+    }
+    for (auto& op : *mlprogram_main_block_->mutable_operations()) {
+      bool is_boundary_cast = false;
+      for (const auto& out : op.outputs()) {
+        if (Contains(cast_outputs, out.name())) {
+          is_boundary_cast = true;
+          break;
+        }
+      }
+      if (is_boundary_cast) {
+        continue;
+      }
+      for (auto& input : *op.mutable_inputs()) {
+        for (auto& arg : *input.second.mutable_arguments()) {
+          auto it = bool_input_value_rename_.find(arg.name());
+          if (it != bool_input_value_rename_.end()) {
+            arg.set_name(it->second);
+          }
+        }
+      }
+    }
+  }
+
+  // bool graph outputs: the op builders produced a bool value named `name`. Rename that producer's output
+  // (and any internal consumers) to a bool intermediate, then append a bool->int32 cast producing the
+  // int32 feature/block-output `name`.
+  for (const auto& [name, shape] : bool_graph_outputs_) {
+    const std::string pre_name = GetUniqueName(name + "_from_bool");
+    bool found = false;
+    for (auto& op : *mlprogram_main_block_->mutable_operations()) {
+      for (auto& out : *op.mutable_outputs()) {
+        if (out.name() == name) {
+          out.set_name(pre_name);
+          found = true;
+        }
+      }
+    }
+    if (!found) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                             "RewriteBoolGraphIOBoundaries: bool graph output not produced by any operation: ", name);
+    }
+    for (auto& op : *mlprogram_main_block_->mutable_operations()) {
+      for (auto& input : *op.mutable_inputs()) {
+        for (auto& arg : *input.second.mutable_arguments()) {
+          if (arg.name() == name) {
+            arg.set_name(pre_name);
+          }
+        }
+      }
     }
+    AddBoundaryCastOp(pre_name, name, ONNX_NAMESPACE::TensorProto_DataType_INT32, shape);
   }
 
   return Status::OK();
@@ -994,6 +1101,7 @@ Status ModelBuilder::CreateModel() {
   ORT_RETURN_IF_ERROR(RegisterModelOutputs());
 
   if (create_ml_program_) {
+    ORT_RETURN_IF_ERROR(RewriteBoolGraphIOBoundaries());
     SanitizeNames();
   }
 
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index f3012e8137e8c..1430a98b5c5c6 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -199,6 +199,21 @@ class ModelBuilder {
   Status RegisterModelOutputs();
   Status RegisterModelInputOutput(const NodeArg& node_arg, bool is_input);
 
+  // CoreML's ArrayFeatureType (the external model IO representation) has no bool, so a bool graph
+  // input/output is exposed as an INT32 feature, mirroring the int64 handling. Inside the ML Program
+  // the op builders still operate on bool tensors, so the boundary needs int32<->bool cast ops:
+  //   - bool graph input:  cast(int32 feature) -> bool, then consumers reference the bool value.
+  //   - bool graph output: cast(internal bool) -> int32, which becomes the int32 feature.
+  // The input cast is emitted in RegisterModelInputOutput; RewriteBoolGraphIOBoundaries() runs after the op
+  // builders to repoint input consumers and append the output casts, so the builders stay unaware of the
+  // boundary representation. The int32<->bool data conversion happens at runtime in model.mm, like int64.
+  Status RewriteBoolGraphIOBoundaries();
+
+  // Append a 'cast' op (input_value_name -> output_value_name with the given ONNX output type) to the main
+  // block. Used by RegisterModelInputOutput (bool inputs) and RewriteBoolGraphIOBoundaries (bool outputs).
+  void AddBoundaryCastOp(std::string_view input_value_name, std::string_view output_value_name,
+                         int32_t output_onnx_type, gsl::span<const int64_t> shape);
+
   // Record the onnx scalar output names
   void AddScalarOutput(const std::string& output_name);
 
@@ -221,6 +236,14 @@ class ModelBuilder {
   std::unordered_set<std::string> int64_outputs_;
   std::unordered_map<std::string, OnnxTensorInfo> input_output_info_;
 
+  // bool graph IO exposed as INT32 features (see RewriteBoolGraphIOBoundaries).
+  // For inputs the int32->bool cast is emitted eagerly in RegisterModelInputOutput (so it sits ahead of
+  // its consumers in the block); this map records original input name -> bool value name so the consumer
+  // references can be rewritten after the op builders have run.
+  std::unordered_map<std::string, std::string> bool_input_value_rename_;
+  // For outputs the bool->int32 cast is appended after the op builders run; {name, shape} captured here.
+  std::vector<std::pair<std::string, std::vector<int64_t>>> bool_graph_outputs_;
+
   std::unordered_map<std::string, int> initializer_usage_;
   std::unordered_set<std::string> skipped_inputs_;
 
diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
index 7ba8a9fe5f09c..2dccf6c6550c8 100644
--- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
+++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
@@ -78,6 +78,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations);
   CreateFlattenOpBuilder("Flatten", op_registrations);
   CreateGatherOpBuilder("Gather", op_registrations);
+  CreateGatherNDOpBuilder("GatherND", op_registrations);
   CreateGemmOpBuilder("Gemm", op_registrations);
   CreateGridSampleOpBuilder("GridSample", op_registrations);
   CreateIdentityOpBuilder("Identity", op_registrations);
diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h
index d399a4f91576e..49b5779866677 100644
--- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h
+++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h
@@ -29,6 +29,7 @@ void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrat
 void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
+void CreateGatherNDOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateIdentityOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 71664021ea2fb..9d8a47ae7e5e9 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -146,6 +146,23 @@ Status CreateInputFeatureProvider(const std::unordered_map<std::string, OnnxTens
 
         break;
       }
+      case ONNX_NAMESPACE::TensorProto_DataType_BOOL: {
+        // CoreML has no bool MLMultiArray; the bool feature is exposed as int32 (see the model builder's
+        // RewriteBoolGraphIOBoundaries). Convert the bool input to int32 (0/1).
+        data_type = MLMultiArrayDataTypeInt32;
+
+        const auto num_elements = narrow<size_t>(ShapeSize(shape));
+        const auto input_span = gsl::span{static_cast<const bool*>(onnx_tensor_data.buffer), num_elements};
+        auto conversion_buffer = std::make_unique<int32_t[]>(num_elements);
+        const auto conversion_span = gsl::span{conversion_buffer.get(), num_elements};
+        std::transform(input_span.begin(), input_span.end(), conversion_span.begin(),
+                       [](bool v) { return v ? 1 : 0; });
+
+        conversion_buffers.emplace_back(std::move(conversion_buffer));
+        data_pointer = conversion_buffers.back().get();
+
+        break;
+      }
       default: {
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Output data type is not supported, actual type: ",
                                onnx_tensor_data.tensor_info.data_type);
@@ -250,6 +267,26 @@ Status CopyMLMultiArrayBuffer(const void* mlmultiarray_buffer, void* tensor_buff
       }
       break;
     }
+    // CoreML has no bool MLMultiArray; a bool output is produced as int32 (see the model builder's
+    // RewriteBoolGraphIOBoundaries) and converted back to bool here.
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL: {
+      ORT_RETURN_IF(array.dataType != MLMultiArrayDataTypeInt32,
+                    "CoreML output data type is not MLMultiArrayDataTypeInt32");
+
+      const int32_t* src_buffer = static_cast<const int32_t*>(mlmultiarray_buffer);
+      bool* dst_buffer = static_cast<bool*>(tensor_buffer);
+
+      for (int64_t idx = 0; idx < num_blocks; ++idx) {
+        auto input_span = gsl::span{src_buffer, static_cast<size_t>(block_size)};
+        auto output_span = gsl::span{dst_buffer, static_cast<size_t>(block_size)};
+        std::transform(input_span.begin(), input_span.end(), output_span.begin(),
+                       [](int32_t v) { return v != 0; });
+
+        src_buffer += stride;
+        dst_buffer += block_size;
+      }
+      break;
+    }
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
                              "Output data type is not supported, actual type: ", onnx_data_type);
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index 77f43b60dd6f8..7a20c73da49fb 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -282,6 +282,55 @@ TEST(CoreMLExecutionProviderTest, ShapeThenSliceAndGather) {
 #endif
 }
 
+// GatherND on the ML Program path is only claimed when 'indices' is a constant initializer
+// (see GatherNDOpBuilder::IsOpSupportedImpl -- CoreML's gather_nd miscomputes some shapes with a
+// runtime indices input). This is the supported path: a multi-dimensional slice gather (index depth 1
+// on rank-3 data) with constant indices must run on CoreML and match the CPU result.
+TEST(CoreMLExecutionProviderTest, GatherNDConstantIndicesMLProgram) {
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gnd_const", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+                           domain_to_version, {}, DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+  auto make_type = [](int32_t et, std::vector<int64_t> dims) {
+    ONNX_NAMESPACE::TypeProto t;
+    t.mutable_tensor_type()->set_elem_type(et);
+    for (auto d : dims) t.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(d);
+    return t;
+  };
+  // data[2,2,2] gathered with index depth 1 -> Y = indices.shape[:-1] + data.shape[1:] = [2,1] + [2,2].
+  const auto data_t = make_type(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 2, 2});
+  const auto out_t = make_type(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1, 2, 2});
+  auto& data = graph.GetOrCreateNodeArg("data", &data_t);
+  auto& out = graph.GetOrCreateNodeArg("Y", &out_t);
+  // Constant int64 'indices' initializer of shape [2,1,1] holding {1, 0}.
+  ONNX_NAMESPACE::TensorProto idx;
+  idx.set_name("indices");
+  idx.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  for (int64_t d : {2, 1, 1}) idx.add_dims(d);
+  for (int64_t v : {1, 0}) idx.add_int64_data(v);
+  graph.AddInitializedTensor(idx);
+  auto& idx_arg = graph.GetOrCreateNodeArg("indices", nullptr);
+  graph.AddNode("gnd", "GatherND", "", {&data, &idx_arg}, {&out});
+  ORT_THROW_IF_ERROR(graph.Resolve());
+  std::string md;
+  model.ToProto().SerializeToString(&md);
+  gsl::span<const std::byte> span{reinterpret_cast<const std::byte*>(md.data()), md.size()};
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {2, 2, 2};
+  std::vector<int64_t> vals = {0, 1, 2, 3, 4, 5, 6, 7};
+  OrtValue dv;
+  CreateMLValue<int64_t>(CPUAllocator::DefaultInstance(), dims, vals, &dv);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("data", dv));
+  RunAndVerifyOutputsWithEP(span, CurrentTestName(), MakeCoreMLExecutionProvider("MLProgram"), feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  // Outputs are only compared on Apple platforms; elsewhere still check that the EP claims the whole graph.
+  TestModelLoad(span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
 #endif  // !(ORT_MINIMAL_BUILD)
 
 TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) {
@@ -2450,6 +2499,130 @@ TEST(CoreMLExecutionProviderTest, CastBoolMLProgramPartition) {
   TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
 }
 
+namespace {
+ONNX_NAMESPACE::TypeProto MakeTensorType(int32_t elem_type, const std::vector<int64_t>& shape) {
+  ONNX_NAMESPACE::TypeProto type_proto;  // tensor type: elem_type plus one dim_value per entry of `shape`
+  type_proto.mutable_tensor_type()->set_elem_type(elem_type);
+  for (const int64_t dim : shape) type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(dim);
+  return type_proto;
+}
+
+// Adds the constant "indices" initializer to `graph`: int64, shape [2,1], values {{0},{2}}.
+void AddGatherNDIndices(onnxruntime::Graph& graph) {
+  ONNX_NAMESPACE::TensorProto initializer;
+  initializer.set_name("indices");
+  initializer.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  for (const int64_t dim : {2, 1}) initializer.add_dims(dim);
+  initializer.add_int64_data(0);  // first index
+  initializer.add_int64_data(2);  // second index
+  graph.AddInitializedTensor(initializer);
+}
+
+// Serialized single-node model: GatherND(float data[4,3] input, const indices[2,1]) -> float Out[2,3].
+std::string MakeGatherNDModelData() {
+  onnxruntime::Model model("gather_nd_test", false, DefaultLoggingManager().DefaultLogger());
+  onnxruntime::Graph& graph = model.MainGraph();
+  const auto data_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {4, 3});
+  const auto index_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1});
+  const auto result_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3});
+  auto& data_arg = graph.GetOrCreateNodeArg("data", &data_type);
+  auto& index_arg = graph.GetOrCreateNodeArg("indices", &index_type);
+  auto& result_arg = graph.GetOrCreateNodeArg("Out", &result_type);
+  AddGatherNDIndices(graph);  // constant initializer backing "indices"
+  graph.AddNode("gather_nd", "GatherND", "gather rows", {&data_arg, &index_arg}, {&result_arg});
+
+  ORT_THROW_IF_ERROR(graph.Resolve());
+  const auto model_proto = model.ToProto();
+  std::string serialized;
+  model_proto.SerializeToString(&serialized);
+  return serialized;
+}
+
+// Src (int32 input) -> Cast(bool) -> GatherND -> Cast(float) -> Out. Covers GatherND
+// on bool data, which the builder lowers as cast -> gather_nd -> cast (the bool
+// tensors stay internal to the CoreML partition).
+std::string MakeGatherNDBoolModelData() {
+  onnxruntime::Model model("gather_nd_bool_test", false, DefaultLoggingManager().DefaultLogger());
+  onnxruntime::Graph& graph = model.MainGraph();
+  const auto src_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT32, {4, 3});
+  const auto mask_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_BOOL, {4, 3});
+  const auto index_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1});
+  const auto gathered_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_BOOL, {2, 3});
+  const auto result_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3});
+
+  auto& src_arg = graph.GetOrCreateNodeArg("Src", &src_type);
+  auto& mask_arg = graph.GetOrCreateNodeArg("data", &mask_type);
+  auto& index_arg = graph.GetOrCreateNodeArg("indices", &index_type);
+  auto& gathered_arg = graph.GetOrCreateNodeArg("gathered", &gathered_type);
+  auto& result_arg = graph.GetOrCreateNodeArg("Out", &result_type);
+  AddGatherNDIndices(graph);
+
+  graph.AddNode("cast_to_bool", "Cast", "int32 -> bool", {&src_arg}, {&mask_arg})
+      .AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_BOOL));
+  graph.AddNode("gather_nd", "GatherND", "gather bool rows", {&mask_arg, &index_arg}, {&gathered_arg});
+  graph.AddNode("cast_to_float", "Cast", "bool -> float", {&gathered_arg}, {&result_arg})
+      .AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT));
+
+  ORT_THROW_IF_ERROR(graph.Resolve());
+  std::string serialized;
+  model.ToProto().SerializeToString(&serialized);
+  return serialized;
+}
+
+// Serialized model: GatherND with batch_dims=1 over float data[2,3] and const indices[2,1] -> Out[2].
+std::string MakeGatherNDBatchDimsModelData() {
+  onnxruntime::Model model("gather_nd_batchdims_test", false, DefaultLoggingManager().DefaultLogger());
+  onnxruntime::Graph& graph = model.MainGraph();
+  const auto data_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2, 3});
+  const auto index_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_INT64, {2, 1});
+  const auto result_type = MakeTensorType(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, {2});
+
+  auto& data_arg = graph.GetOrCreateNodeArg("data", &data_type);
+  auto& index_arg = graph.GetOrCreateNodeArg("indices", &index_type);
+  auto& result_arg = graph.GetOrCreateNodeArg("Out", &result_type);
+
+  // Constant "indices" initializer: int64, shape [2,1], values {0, 1}.
+  ONNX_NAMESPACE::TensorProto index_tensor;
+  index_tensor.set_name("indices");
+  index_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  for (const int64_t dim : {2, 1}) index_tensor.add_dims(dim);
+  for (const int64_t value : {0, 1}) index_tensor.add_int64_data(value);
+  graph.AddInitializedTensor(index_tensor);
+
+  graph.AddNode("gather_nd", "GatherND", "batched gather", {&data_arg, &index_arg}, {&result_arg})
+      .AddAttribute("batch_dims", static_cast<int64_t>(1));
+
+  ORT_THROW_IF_ERROR(graph.Resolve());
+  std::string serialized;
+  model.ToProto().SerializeToString(&serialized);
+  return serialized;
+}
+}  // namespace
+
+// GatherND must be taken by CoreML in the ML Program format, where it maps to 'gather_nd'.
+TEST(CoreMLExecutionProviderTest, GatherND_MLProgram) {
+  const std::string serialized = MakeGatherNDModelData();
+  const auto* const model_bytes = reinterpret_cast<const std::byte*>(serialized.data());
+  const gsl::span<const std::byte> model_span{model_bytes, serialized.size()};
+
+#if defined(__APPLE__)
+  std::vector<int64_t> data_shape{4, 3};
+  std::vector<float> data_values{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  OrtValue data_value;
+  CreateMLValue<float>(CPUAllocator::DefaultInstance(), data_shape, data_values, &data_value);
+  NameMLValMap feeds;
+  feeds.emplace("data", data_value);
+
+  EPVerificationParams verification{};
+  verification.ep_node_assignment = ExpectedEPNodeAssignment::All;
+  RunAndVerifyOutputsWithEP(model_span, CurrentTestName(),
+                            MakeCoreMLExecutionProvider("MLProgram"), feeds, verification);
+#else
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
 // Sin and Cos are lowered to the ML Program 'sin' / 'cos' ops.
 TEST(CoreMLExecutionProviderTest, SinCos_MLProgram) {
   const std::string model_data = MakeSinCosModelData();
@@ -2550,6 +2723,31 @@ TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis1) {
 #endif
 }
 
+// A bool-data GatherND is lowered by the builder as cast(bool->int32) -> gather_nd ->
+// cast(int32->bool) because CoreML's gather_nd rejects bool 'x'. The whole
+// Cast->GatherND->Cast model is expected to be assigned to CoreML.
+TEST(CoreMLExecutionProviderTest, GatherNDBoolData_MLProgram) {
+  const std::string serialized = MakeGatherNDBoolModelData();
+  const auto* const model_bytes = reinterpret_cast<const std::byte*>(serialized.data());
+  const gsl::span<const std::byte> model_span{model_bytes, serialized.size()};
+
+#if defined(__APPLE__)
+  std::vector<int64_t> src_shape{4, 3};
+  std::vector<int32_t> src_values{0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1};
+  OrtValue src_value;
+  CreateMLValue<int32_t>(CPUAllocator::DefaultInstance(), src_shape, src_values, &src_value);
+  NameMLValMap feeds;
+  feeds.emplace("Src", src_value);
+
+  EPVerificationParams verification{};
+  verification.ep_node_assignment = ExpectedEPNodeAssignment::All;
+  RunAndVerifyOutputsWithEP(model_span, CurrentTestName(),
+                            MakeCoreMLExecutionProvider("MLProgram"), feeds, verification);
+#else
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
 // Sin/Cos only have an ML Program lowering (the NeuralNetwork
 // UnaryFunctionLayerParams has no sin/cos), so on the NeuralNetwork format
 // they must fall back to CPU rather than be claimed.
@@ -2630,6 +2828,24 @@ TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis0) {
 #endif
 }
 
+// There is no NeuralNetwork lowering for GatherND (ML Program only), so on the
+// NeuralNetwork format the node must fall back to CPU.
+TEST(CoreMLExecutionProviderTest, GatherNDNeuralNetworkNotSupported) {
+  const std::string serialized = MakeGatherNDModelData();
+  const auto* const model_bytes = reinterpret_cast<const std::byte*>(serialized.data());
+  const gsl::span<const std::byte> model_span{model_bytes, serialized.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None);
+}
+
+// GatherND with batch_dims != 0 must fall back to CPU: the iOS15 gather_nd op
+// has no batch_dims parameter.
+TEST(CoreMLExecutionProviderTest, GatherNDBatchDimsNotSupported) {
+  const std::string serialized = MakeGatherNDBatchDimsModelData();
+  const auto* const model_bytes = reinterpret_cast<const std::byte*>(serialized.data());
+  const gsl::span<const std::byte> model_span{model_bytes, serialized.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None);
+}
+
 TEST(CoreMLExecutionProviderTest, GatherScalarIndicesNegativeAxis) {
   // Scalar Gather with negative axis (-1) — verifies HandleNegativeAxis is
   // applied when computing the squeeze axis.
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
index 0365da55bd48a..d889fb4972878 100644
--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -18,6 +18,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:Elu||
 |ai.onnx:Erf||
 |ai.onnx:Exp||
+|ai.onnx:GatherND|batch_dims must be 0.|
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:Gelu||
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|