First pass addressing review comments

ericcraw · ericcraw · commit cf5a86bc046d · 2026-04-16T17:41:04.000-07:00
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
@@ -913,11 +913,6 @@ class PlannerImpl {
           ProcessDef(index, node_output);
           OrtDevice output_device = exec_provider->GetOrtDeviceByMemType(p_kernel_def->OutputMemoryType(i));
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
-          // Downstream nodes of certain providers may require a CPU accessible location override
-          // to make sure the EP does not incur an unnecessary copy.
-          // We only do it for CPU based EPs. We are not likely to encounter
-          // non CPU devices here since they are already taken care of by using MemCpy nodes earlier.
-          // However, we still ignore them.
           if (output_device.UsesCpuMemory()) {
             const auto& output_name = node_output->Name();
             const auto consumers = graph_viewer_.GetConsumerNodes(output_name);
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
@@ -50,29 +50,48 @@ bool ProviderIsCpuBased(const IExecutionProvider& provider) {
   return provider.GetDevice().Type() == OrtDevice::CPU;
 }
 
-// Returns true if no data transfer is needed between the two devices.
-// HOST_ACCESSIBLE memory is a superset — accessible by both host and device — so it can satisfy
-// DEFAULT memory requirements on the same physical device without a copy.
-static bool DevicesAreMemoryCompatible(const OrtDevice& a, const OrtDevice& b) {
-  const bool a_is_cpu_mem = a.UsesCpuMemory();
-  const bool b_is_cpu_mem = b.UsesCpuMemory();
-
-  // Both CPU-accessible: compatible unless both are HOST_ACCESSIBLE on different physical devices.
-  if (a_is_cpu_mem && b_is_cpu_mem) {
-    if (a.Type() == OrtDevice::CPU || b.Type() == OrtDevice::CPU) {
-      return true;
+// Returns true if src memory can satisfy tgt's requirements without a data copy.
+//
+// HOST_ACCESSIBLE → DEFAULT is valid: the device can access HOST_ACCESSIBLE memory directly.
+// DEFAULT → HOST_ACCESSIBLE is NOT valid: HOST_ACCESSIBLE implies CPU consumers, and DEFAULT
+// memory is device-only — the CPU cannot read it.
+//
+// In every non-identical case, src alignment must meet tgt's minimum requirement.
+// Alignment 0 means "unspecified" and is treated as compatible with any requirement.
+bool CanSourceSatisfyTarget(const OrtDevice& src, const OrtDevice& tgt) {
+  const bool src_is_cpu_mem = src.UsesCpuMemory();
+  const bool tgt_is_cpu_mem = tgt.UsesCpuMemory();
+
+  // Identical devices are always compatible.
+  if (src == tgt) {
+    return true;
+  }
+
+  // Alignment 0 means "unspecified" — treat as compatible with any alignment requirement.
+  const bool is_alignment_satisfied = src.GetAlignment() == 0 || tgt.GetAlignment() == 0 ||
+                                      src.GetAlignment() >= tgt.GetAlignment();
+
+  // Both are CPU-accessible (CPU type or HOST_ACCESSIBLE memory).
+  if (src_is_cpu_mem && tgt_is_cpu_mem) {
+    // CPU target can read from any CPU or HOST_ACCESSIBLE source, regardless of the source device
+    if (tgt.Type() == OrtDevice::CPU) {
+      return is_alignment_satisfied;
     }
-    return a.Type() == b.Type() &&
-           a.Vendor() == b.Vendor() &&
-           a.Id() == b.Id();
+    // Both are HOST_ACCESSIBLE on some device: require the same physical device.
+    return src.Type() == tgt.Type() &&
+           src.Vendor() == tgt.Vendor() &&
+           src.Id() == tgt.Id() && is_alignment_satisfied;
   }
 
-  // HOST_ACCESSIBLE <-> DEFAULT: compatible only on the same physical device.
-  if ((a_is_cpu_mem != b_is_cpu_mem) &&
-      a.Type() == b.Type() &&
-      a.Vendor() == b.Vendor() &&
-      a.Id() == b.Id()) {
-    return true;
+  // HOST_ACCESSIBLE source can serve a DEFAULT target on the same physical device —
+  // the device can DMA from HOST_ACCESSIBLE memory directly.
+  // The reverse (DEFAULT → HOST_ACCESSIBLE) is unsafe: HOST_ACCESSIBLE implies CPU consumers,
+  // and DEFAULT memory is device-only so the CPU cannot read it.
+  if (src_is_cpu_mem && !tgt_is_cpu_mem &&
+      src.Type() == tgt.Type() &&
+      src.Vendor() == tgt.Vendor() &&
+      src.Id() == tgt.Id()) {
+    return is_alignment_satisfied;
   }
 
   return false;
@@ -146,16 +165,19 @@ const std::string& GetNodeInputProviderType(const SessionState::NodeInfo& info)
 }
 
 // Populate device_fetches for the output-copy path.
-// Reuses a pre-allocated user buffer when the memory is compatible (same device or HOST_ACCESSIBLE
-// <-> DEFAULT on the same physical device); otherwise inserts an empty placeholder.
+// When the user pre-allocates a fetch buffer, reuse it directly as the EP's output buffer if
+// the user's buffer (tgt) can satisfy the EP's output device (src) requirements — i.e.,
+// CanSourceSatisfyTarget(tgt, src). This avoids a post-execution copy.
+// Otherwise inserts an empty placeholder for the EP to allocate into.
 static void PopulateDeviceFetches(gsl::span<const MLValueCopyInfo> fetch_copy_info,
                                   const std::vector<OrtValue>& fetches,
                                   std::vector<OrtValue>& device_fetches) {
+  ORT_ENFORCE(fetch_copy_info.size() >= fetches.size());
   device_fetches.reserve(fetches.size());
   for (size_t i = 0; i < fetches.size(); ++i) {
     const auto& src = fetch_copy_info[i].source_device;
     const auto& tgt = fetch_copy_info[i].target_device;
-    if ((src == tgt || DevicesAreMemoryCompatible(src, tgt)) && fetches[i].IsAllocated()) {
+    if (CanSourceSatisfyTarget(tgt, src) && fetches[i].IsAllocated()) {
       device_fetches.push_back(fetches[i]);
     } else {
       device_fetches.push_back({});
@@ -178,10 +200,9 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
                                  std::vector<IDataTransfer::SrcDstPair>* copy_tensor_pairs = nullptr)
 #endif
 {
-  // No data transfer needed if devices are the same or memory-compatible
-  // (e.g. HOST_ACCESSIBLE <-> DEFAULT on the same physical device).
-  if (copy_info.source_device == copy_info.target_device ||
-      DevicesAreMemoryCompatible(copy_info.source_device, copy_info.target_device)) {
+  // No data transfer needed if devices are identical, or the source can satisfy the target
+  // (e.g. a HOST_ACCESSIBLE source serving a DEFAULT target on the same physical device).
+  if (CanSourceSatisfyTarget(copy_info.source_device, copy_info.target_device)) {
     target_mlvalue = source_mlvalue;
     return Status::OK();
   }
@@ -372,8 +393,7 @@ static bool FinalizeCopyInfoForFeeds(gsl::span<const OrtDevice> feed_locations,
   for (size_t i = 0, end = feed_locations.size(); i < end; ++i) {
     copy_info[i].source_device = feed_locations[i];
 
-    if (copy_info[i].source_device != copy_info[i].target_device &&
-        !DevicesAreMemoryCompatible(copy_info[i].source_device, copy_info[i].target_device)) {
+    if (!CanSourceSatisfyTarget(copy_info[i].source_device, copy_info[i].target_device)) {
       copy_needed = true;
     }
   }
@@ -394,8 +414,7 @@ static bool FinalizeCopyInfoForFetches(gsl::span<const OrtDevice* const>& fetch_
       copy_info[i].target_device = *alloc_info;
     }
 
-    if (copy_info[i].source_device != copy_info[i].target_device &&
-        !DevicesAreMemoryCompatible(copy_info[i].source_device, copy_info[i].target_device)) {
+    if (!CanSourceSatisfyTarget(copy_info[i].source_device, copy_info[i].target_device)) {
       copy_needed = true;
     }
   }
@@ -702,9 +721,7 @@ ExecuteGraphImpl(const SessionState& session_state,
       feeds_to_use = device_feeds;
     }
 
-    auto num_outputs = fetches.size();
     const auto& fetch_copy_info = feeds_fetches_manager.GetFetchesDeviceCopyInfo();
-
     if (device_copy_checks.output_copy_needed == DeviceCopyCheck::Copy) {
       PopulateDeviceFetches(fetch_copy_info, fetches, device_fetches);
       p_fetches = &device_fetches;
@@ -847,7 +864,6 @@ common::Status ExecutePartialGraphImpl(const SessionState& session_state, FeedsF
       p_feeds = device_feeds;
     }
 
-    auto num_outputs = fetches.size();
     const auto& fetch_copy_info = feeds_fetches_manager.GetFetchesDeviceCopyInfo();
 
     if (device_copy_checks.output_copy_needed == DeviceCopyCheck::Copy) {
diff --git a/onnxruntime/core/framework/utils.h b/onnxruntime/core/framework/utils.h
@@ -57,6 +57,11 @@ bool ProviderIsCpuBased(const IExecutionProvider& provider);
 
 bool IsMemcpyNode(const Node& node);
 
+// Returns true if src memory can satisfy tgt's device and alignment requirements without a data copy.
+// HOST_ACCESSIBLE -> DEFAULT is valid (device can access HOST_ACCESSIBLE memory directly).
+// DEFAULT -> HOST_ACCESSIBLE is NOT valid (CPU cannot read device-only memory).
+bool CanSourceSatisfyTarget(const OrtDevice& src, const OrtDevice& tgt);
+
 common::Status CopyOneInputAcrossDevices(const SessionState& session_state, const std::string& input_name,
                                          const OrtValue& orig_mlvalue, OrtValue& new_mlvalue);
 
diff --git a/onnxruntime/core/session/provider_policy_context.cc b/onnxruntime/core/session/provider_policy_context.cc
@@ -421,8 +421,8 @@ void ProviderPolicyContext::FoldSelectedDevices(std::vector<const OrtEpDevice*>
     info.ep_factory = devices_selected[0]->ep_factory;
 
     do {
-      auto iter = std::find_if(devices_selected.begin(), devices_selected.end(), [&ep_name](const OrtEpDevice* d) {
-        return d->ep_name == ep_name;
+      auto iter = std::find_if(devices_selected.begin(), devices_selected.end(), [&ep_name, &info](const OrtEpDevice* d) {
+        return d->ep_name == ep_name && d->ep_factory == info.ep_factory;
       });
 
       if (iter != devices_selected.end()) {
diff --git a/onnxruntime/test/framework/utils_test.cc b/onnxruntime/test/framework/utils_test.cc
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "gtest/gtest.h"
+#include "core/framework/utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+constexpr OrtDevice::VendorId kTestVendor1 = 0x1234;
+constexpr OrtDevice::VendorId kTestVendor2 = 0x5678;
+
+static OrtDevice Cpu() {
+  return OrtDevice{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, 0};
+}
+
+static OrtDevice HostAccessible(OrtDevice::VendorId vendor, OrtDevice::DeviceId id,
+                                OrtDevice::Alignment align = 0) {
+  return OrtDevice{OrtDevice::NPU, OrtDevice::MemType::HOST_ACCESSIBLE, vendor, id, align};
+}
+
+static OrtDevice Default(OrtDevice::VendorId vendor, OrtDevice::DeviceId id,
+                         OrtDevice::Alignment align = 0) {
+  return OrtDevice{OrtDevice::NPU, OrtDevice::MemType::DEFAULT, vendor, id, align};
+}
+
+TEST(CanSourceSatisfyTargetTest, CpuSourceHostAccessibleTarget) {
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(Cpu(), HostAccessible(kTestVendor1, 0)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleSourceCpuTarget) {
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(HostAccessible(kTestVendor1, 0), Cpu()));
+}
+
+// src == tgt early return: identical devices are always compatible
+TEST(CanSourceSatisfyTargetTest, BothHostAccessibleSameDevice) {
+  auto dev = HostAccessible(kTestVendor1, 0, 16);
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(dev, dev));
+}
+
+// Branch 3: both HOST_ACCESSIBLE, different physical device
+TEST(CanSourceSatisfyTargetTest, BothHostAccessibleDifferentId) {
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0), HostAccessible(kTestVendor1, 1)));
+}
+
+TEST(CanSourceSatisfyTargetTest, BothHostAccessibleDifferentVendor) {
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0), HostAccessible(kTestVendor2, 0)));
+}
+
+TEST(CanSourceSatisfyTargetTest, BothHostAccessibleDifferentAlignment) {
+  // src alignment (16) < tgt alignment (32): fails the alignment check (the reverse, 32 -> 16, would pass)
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0, 16), HostAccessible(kTestVendor1, 0, 32)));
+}
+
+// Branch 4: HOST_ACCESSIBLE (src) -> DEFAULT (tgt), same physical device
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultSameDevice) {
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0), Default(kTestVendor1, 0)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultAlignmentSatisfied) {
+  // src alignment >= tgt alignment: compatible
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0, 64), Default(kTestVendor1, 0, 32)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultAlignmentInsufficient) {
+  // src alignment < tgt alignment: incompatible
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0, 16), Default(kTestVendor1, 0, 64)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultSrcAlignmentZero) {
+  // 0 = unspecified, treated as wildcard
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0, 0), Default(kTestVendor1, 0, 64)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultTgtAlignmentZero) {
+  // 0 = unspecified, treated as wildcard
+  EXPECT_TRUE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0, 16), Default(kTestVendor1, 0, 0)));
+}
+
+TEST(CanSourceSatisfyTargetTest, HostAccessibleToDefaultDifferentDeviceId) {
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      HostAccessible(kTestVendor1, 0), Default(kTestVendor1, 1)));
+}
+
+// Branch 5: incompatible cases
+
+TEST(CanSourceSatisfyTargetTest, DefaultToHostAccessibleRejected) {
+  // Reversed direction: CPU cannot read DEFAULT (device-only) memory
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      Default(kTestVendor1, 0), HostAccessible(kTestVendor1, 0)));
+}
+
+TEST(CanSourceSatisfyTargetTest, DefaultToDefaultRejected) {
+  EXPECT_FALSE(utils::CanSourceSatisfyTarget(
+      Default(kTestVendor1, 0), Default(kTestVendor2, 0)));
+}
+
+}  // namespace test
+}  // namespace onnxruntime