From 1f33802dc0bb3ce7893ffc2b23b495992336e9aa Mon Sep 17 00:00:00 2001
From: anusha godavarthy Surya <agodavar@amd.com>
Date: Sat, 9 May 2026 10:28:06 +0000
Subject: [PATCH] =?UTF-8?q?Revert=20"[hipGraph]=20Add=20graph=20signal=20p?=
 =?UTF-8?q?ool=20and=20remove=20pre/post=20markers=20for=20non-=E2=80=A6?=
 =?UTF-8?q?=20(#5333)"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit f17b05703aecf3f9f78a979239349ed00d27e289.

Reverting due to two issues observed in GraphBench:

1. Resource leak: per-launch GraphSignalPool was deleted via a forward-
   declared pointer in ~AccumulateCommand, so ~GraphSignalPool never ran
   and every per-launch HSA signal was leaked
   ("Resource leak detected by SharedSignalPool, N Signals leaked").

2. Per-launch host overhead: shifting from the runtime's rotating
   signal_list_ (which reuses signals after warmup with WaitCurrent +
   WaitNext) to a per-launch graph signal pool reintroduces N×
   signal_create + signal_destroy on every hipGraphLaunch, causing
   measurable regression in graph_bench across straight/paths2/paths4/
   full2/full4 topologies.

A safer redesign that preserves PR #5333's per-launch isolation while
recycling signals (mirroring the runtime pool's WaitCurrent/WaitNext
discipline) is needed before re-landing.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../clr/hipamd/src/hip_graph_internal.cpp     | 122 +++++++-------
 .../clr/hipamd/src/hip_graph_internal.hpp     |  10 --
 projects/clr/rocclr/device/device.hpp         |  16 --
 projects/clr/rocclr/device/rocm/rocdevice.cpp | 112 -------------
 projects/clr/rocclr/device/rocm/rocdevice.hpp |  61 -------
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 149 ++----------------
 .../clr/rocclr/device/rocm/rocvirtual.hpp     |   4 +-
 projects/clr/rocclr/platform/command.cpp      |  20 +--
 projects/clr/rocclr/platform/command.hpp      |  12 --
 9 files changed, 77 insertions(+), 429 deletions(-)

diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 83d7181473a..c152d1a253a 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -432,7 +432,6 @@ void GraphExec::BuildSyncPlan() {
   sync_plan_.patch_list.clear();
   sync_plan_.barrier_packets.clear();
   sync_plan_.leaf_segment_ids.clear();
-  graph_irq_signal_count_ = 0;
 
   auto* device = g_devices[instantiateDeviceId_]->devices()[0];
 
@@ -552,19 +551,7 @@ void GraphExec::BuildSyncPlan() {
     if (segment.segment_ids_edges.empty()) {
       sync_plan_.leaf_segment_ids.push_back(segment.id);
     }
-
-    // Count interrupt signals needed for non-captured host nodes in this segment.
-    for (size_t i = 0; i < segment.nodes.size(); ++i) {
-      if (!segBatch.node_capture_status[i] &&
-          segment.nodes[i]->GetType() == hipGraphNodeTypeHost) {
-        graph_irq_signal_count_ += 2;
-      }
-    }
   }
-
-  // Seed GPU-only signal count with known minimum (SyncPlan HW events)
-  // so even the first launch pre-allocates these instead of growing on demand.
-  graph_signal_count_ = sync_plan_.num_segments;
 }
 
 // ================================================================================================
@@ -1728,19 +1715,19 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
 
   auto* device = g_devices[launch_stream->DeviceId()]->devices()[0];
 
-  // Create per-launch graph signal pool: pre-allocates GPU-only and IRQ signals,
-  // acquires one hw-event per segment, and resets GetLastAcquired() so it only
-  // tracks actual dispatches. All pool method calls are encapsulated in the factory.
+  // Allocate HW events for all segments
   std::vector<void*> segment_hw_events;
-  auto* launch_pool = device->CreateGraphSignalPool(
-      graph_signal_count_, graph_irq_signal_count_,
-      static_cast<size_t>(sync_plan_.num_segments), segment_hw_events);
-  if (launch_pool == nullptr) {
-    if (out_status != nullptr) *out_status = hipErrorOutOfMemory;
-    return nullptr;
+  if (sync_plan_.num_segments > 0) {
+    if (!device->CreateHwEvents(sync_plan_.num_segments, segment_hw_events)) {
+      if (out_status != nullptr) {
+        *out_status = hipErrorOutOfMemory;
+      }
+      return nullptr;
+    }
   }
 
-  // Apply pre-computed patches — writes HW events directly into flatPacketData
+  // Apply pre-computed patches -- writes HW events directly into flatPacketData
+  // via the flat_packet pointers resolved at instantiate time, so no rebuild needed.
   if (!sync_plan_.patch_list.empty()) {
     device->ApplyHwEventPatches(sync_plan_.patch_list, segment_hw_events);
   }
@@ -1748,13 +1735,18 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
   // Track stream assignments across levels
   std::unordered_map<int, hip::Stream*> segment_to_stream;
 
-  // Single AccumulateCommand on launch_stream manages graph pool lifetime
+  // Single AccumulateCommand on launch_stream manages all HW event lifetimes
   // and serves as the dispatch anchor for all segments across all streams.
-  // Note: SetOwnedGraphSignalPool transfers ownership — on error paths below,
-  // graph_accumulate->release() deletes the pool via ~AccumulateCommand.
   auto* graph_accumulate = new amd::AccumulateCommand(*launch_stream, {}, nullptr);
-  graph_accumulate->SetGraphSignalPool(launch_pool);
-  graph_accumulate->SetOwnedGraphSignalPool(launch_pool);
+
+  // Register HW events with graph_accumulate for lifetime tracking.
+  // addHwEvent does not retain — ownership transfers from CreateHwEvents
+  // to graph_accumulate. ~AccumulateCommand releases them after graph completion.
+  for (auto& hw_event : segment_hw_events) {
+    if (hw_event != nullptr) {
+      graph_accumulate->addHwEvent(hw_event, device);
+    }
+  }
 
   // Process segments level by level
   for (int level = 0; level <= max_dependency_level_; ++level) {
@@ -1767,13 +1759,13 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
     AssignStreamsToSegments(segments_at_level, launch_stream, streams, segment_to_stream);
 
     if (level == 0) {
-      // Synchronize internal streams with launch stream's last command if available.
-      // These wait markers use runtime pool signals (no graph pool) for boundary sync.
+      // Synchronize internal streams with launch stream's last command if available
       amd::Command* launch_last_cmd = launch_stream->getLastQueuedCommand(true);
       if (launch_last_cmd != nullptr) {
         amd::Command::EventWaitList launch_wait_list;
         launch_wait_list.push_back(launch_last_cmd);
 
+        // For each segment at level 0, if it's on a different stream, add a wait marker
         for (int segment_id : segments_at_level) {
           hip::Stream* seg_stream = segment_to_stream[segment_id];
           if (seg_stream != launch_stream) {
@@ -1788,7 +1780,7 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
       }
     }
 
-    // Dispatch each segment — barriers are in the batch, signals are patched
+    // Dispatch each segment -- barriers are in the batch, signals are patched
     for (int segment_id : segments_at_level) {
       const auto& segment = segments_[segment_id];
       hip::Stream* current_stream = segment_to_stream[segment_id];
@@ -1805,7 +1797,10 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
     }
   }
 
-  // Synchronize parallel streams back to the launch stream via leaf HW events
+  // Synchronize parallel streams back to the launch stream.
+  // Each leaf segment already has a completion HW event signal patched onto its
+  // last packet. Add those HW events as dependencies on graph_accumulate so
+  // the runtime emits barrier packets when it is enqueued on the launch stream.
   if (IsLeafNodeSyncRequired()) {
     for (int seg_id : sync_plan_.leaf_segment_ids) {
       auto it = segment_to_stream.find(seg_id);
@@ -1815,12 +1810,6 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream,
     }
   }
 
-  // Cache GPU-only signal count for future launches (first launch or if count grew)
-  size_t used = device->GetGraphSignalPoolUsedCount(launch_pool);
-  if (used > graph_signal_count_) {
-    graph_signal_count_ = used;
-  }
-
   graph_accumulate->enqueue();
 
   if (out_status != nullptr) {
@@ -1844,11 +1833,8 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream
 
   size_t batchIndex = 0;
 
-  // Lambda to dispatch the current batch at batchIndex.
-  // attach_signal: when true, places a completion signal on the last packet so a
-  // subsequent non-captured node (e.g. SDMA) can wait on it directly via
-  // WaitingSignal, avoiding an extra barrier packet in the AQL queue.
-  auto dispatchCurrentBatch = [&](bool attach_signal = false) -> hipError_t {
+  // Lambda to dispatch the current batch at batchIndex
+  auto dispatchCurrentBatch = [&]() -> hipError_t {
     if (!segBatch || batchIndex >= segBatch->packet_batches.size()) {
       return hipSuccess;
     }
@@ -1875,7 +1861,7 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream
 
     if (!flatData->empty()) {
       bool batchStatus = stream->vdev()->dispatchAqlPacketBatchFlat(
-          *flatData, *flatHdrs, accumulate, attach_signal, kernelNamesToDispatch, true);
+          *flatData, *flatHdrs, accumulate, false, kernelNamesToDispatch, true);
       if (!batchStatus) {
         return hipErrorUnknown;
       }
@@ -1939,6 +1925,8 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream
   }
 
   for (size_t i = 0; i < segment.nodes.size(); ++i) {
+    auto& node = segment.nodes[i];
+
     if (segBatch && i < segBatch->node_capture_status.size() &&
         segBatch->node_capture_status[i]) {
       // Node was successfully captured - dispatch its batch
@@ -1953,36 +1941,36 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream
         // Skip all consecutive captured nodes that belong to this batch
         i += packetBatch.nodeRanges.size() - 1;
 
-        // Check if an uncaptured SDMA node (memcpy/memset) follows this batch.
-        // If so, attach a signal to the last packet so the SDMA engine can wait
-        // on it directly via WaitingSignal, avoiding an extra barrier packet.
-        bool sdma_follows = false;
-        size_t next = i + 1;
-        if (next < segment.nodes.size() &&
-            next < segBatch->node_capture_status.size() &&
-            !segBatch->node_capture_status[next]) {
-          sdma_follows = (segment.nodes[next]->GetType() == hipGraphNodeTypeMemcpy);
-        }
-        if (sdma_follows) {
-          stream->vdev()->addSystemScope();
-        }
-        status = dispatchCurrentBatch(sdma_follows);
+        status = dispatchCurrentBatch();
         if (status != hipSuccess) return status;
       }
     } else {
-      // Non-captured nodes execute through the normal command pipeline.
-      auto& uncaptured_node = segment.nodes[i];
+      // Node doesn't support capture - execute individually.
+      // Flat dispatch bypasses the Barriers() tracker (ActiveSignal is skipped).
+      // Enqueue Markers before and after the non-captured node to:
+      //   Before: resync the Barriers() tracker so releaseGpuMemoryFence/WaitCurrent
+      //           can track queue progress for the node's internal operations.
+      //   After:  ensure the node's commands (e.g. host node blocking callbacks) are
+      //           fully flushed to the HW queue before the next flat batch is
+      //           dispatched, which writes directly to the HW queue.
+      auto pre_marker = new amd::Marker(*stream, kMarkerDisableFlush, {});
+      if (pre_marker != nullptr) {
+        pre_marker->enqueue();
+        pre_marker->release();
+      }
       if (DEBUG_HIP_GRAPH_DOT_PRINT) {
-        uncaptured_node->stream_id_ = stream->GetStreamId();
-        uncaptured_node->hw_queue_id_ = stream->getQueueID();
+        node->stream_id_ = stream->GetStreamId();
+        node->hw_queue_id_ = stream->getQueueID();
       }
-      uncaptured_node->SetStream(stream);
-      status = uncaptured_node->CreateCommand(uncaptured_node->GetQueue());
+      node->SetStream(stream);
+      status = node->CreateCommand(node->GetQueue());
       if (status != hipSuccess) return status;
-      if (accumulate->graphSignalPool() != nullptr) {
-        uncaptured_node->SetGraphSignalPoolOnCommands(accumulate->graphSignalPool());
+      node->EnqueueCommands(stream);
+      auto post_marker = new amd::Marker(*stream, kMarkerDisableFlush, {});
+      if (post_marker != nullptr) {
+        post_marker->enqueue();
+        post_marker->release();
       }
-      uncaptured_node->EnqueueCommands(stream);
     }
   }
 
diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp
index 4651b3acbc6..4ac0abe1a43 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.hpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.hpp
@@ -291,12 +291,6 @@ class GraphNode : public hipGraphNodeDOTAttribute {
   int GetID() const { return id_; }
   /// Returns command for graph node
   virtual std::vector<amd::Command*>& GetCommands() { return commands_; }
-  /// Propagate graph signal pool to all commands owned by this node
-  void SetGraphSignalPoolOnCommands(amd::roc::GraphSignalPool* pool) {
-    for (auto& cmd : commands_) {
-      if (cmd != nullptr) cmd->SetGraphSignalPool(pool);
-    }
-  }
   /// Returns graph node type
   hipGraphNodeType GetType() const { return type_; }
   /// Clone graph node
@@ -1138,10 +1132,6 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
   SyncPlan sync_plan_;
 
   void BuildSyncPlan();
-
-  //! Cached signal counts for graph signal pool pre-allocation
-  size_t graph_signal_count_ = 0;      //!< GPU-only signals, cached after first launch
-  size_t graph_irq_signal_count_ = 0;  //!< Interrupt signals, determined at instantiation
 };
 
 class ChildGraphNode : public GraphNode, public GraphExec {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 33c2c1bff69..c1d3958c1ba 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -43,7 +43,6 @@
 #include <shared_mutex>
 
 namespace amd {
-namespace roc { struct GraphSignalPool; }
 class Command;
 class CommandQueue;
 class ReadMemoryCommand;
@@ -1355,8 +1354,6 @@ class VirtualDevice : public amd::ReferenceCountedObject {
 
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
-  //! Insert a system scope on the next dispatch
-  virtual void addSystemScope() {}
   //! Init hidden heap for device memory allocations
   virtual void HiddenHeapInit() = 0;
 
@@ -2132,19 +2129,6 @@ class Device : public RuntimeObject {
   virtual void ApplyHwEventPatches(const std::vector<HwEventPatch>& patches,
                                    const std::vector<void*>& hw_events) const {}
 
-  // Creates a fully-initialised per-launch graph signal pool.
-  // Allocates gpu_count GPU-only signals and irq_count interrupt signals,
-  // then acquires segment_count hw-event slots into hw_events and resets
-  // the last-acquired pointer so GetLastAcquired() only tracks dispatches.
-  // Returns nullptr (and leaves hw_events empty) on allocation failure.
-  virtual roc::GraphSignalPool* CreateGraphSignalPool(
-      size_t gpu_count, size_t irq_count,
-      size_t segment_count, std::vector<void*>& hw_events) const { return nullptr; }
-
-  // Returns the number of GPU-only signals consumed from the pool so far.
-  // Used by the graph executor to cache the count for the next launch.
-  virtual size_t GetGraphSignalPoolUsedCount(roc::GraphSignalPool* pool) const { return 0; }
-
   virtual const bool isFineGrainSupported() const {
     return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
   }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 6315c3984f8..967d5be7ae8 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -3599,79 +3599,6 @@ void Device::DestroyHwEvent(void* hw_event) const {
   ReleaseGlobalSignal(hw_event);
 }
 
-// ================================================================================================
-// GraphSignalPool implementation
-// ================================================================================================
-ProfilingSignal* GraphSignalPool::AllocateOneSignal(bool interrupt, bool system_scope) {
-  auto* ps = new ProfilingSignal();
-  hsa_status_t status;
-  if (interrupt && system_scope) {
-    status = Hsa::signal_create(1, 0, nullptr, &ps->signal_);
-  } else {
-    status = Hsa::signal_create(1, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &ps->signal_);
-  }
-  if (status != HSA_STATUS_SUCCESS) {
-    delete ps;
-    return nullptr;
-  }
-  ps->flags_.interrupt_ = interrupt;
-  return ps;
-}
-
-bool GraphSignalPool::Allocate(size_t count) {
-  signals_.reserve(count);
-  for (size_t i = signals_.size(); i < count; ++i) {
-    auto* ps = AllocateOneSignal(false, false);
-    if (ps == nullptr) return false;
-    signals_.push_back(ps);
-  }
-  capacity_.store(signals_.size(), std::memory_order_release);
-  next_idx_.store(0);
-  irq_next_idx_.store(0);
-  return true;
-}
-
-bool GraphSignalPool::AllocateIrq(size_t count, bool system_scope) {
-  irq_signals_.reserve(count);
-  for (size_t i = irq_signals_.size(); i < count; ++i) {
-    auto* ps = AllocateOneSignal(true, system_scope);
-    if (ps == nullptr) return false;
-    irq_signals_.push_back(ps);
-  }
-  irq_capacity_.store(irq_signals_.size(), std::memory_order_release);
-  irq_next_idx_.store(0);
-  return true;
-}
-
-ProfilingSignal* GraphSignalPool::GrowAndAcquire(size_t idx) {
-  amd::ScopedLock sl(lock_);
-  if (idx < signals_.size()) return signals_[idx];
-  while (signals_.size() <= idx) {
-    auto* ps = AllocateOneSignal(false, false);
-    if (ps == nullptr) return nullptr;
-    signals_.push_back(ps);
-  }
-  capacity_.store(signals_.size(), std::memory_order_release);
-  return signals_[idx];
-}
-
-ProfilingSignal* GraphSignalPool::GrowAndAcquireIrq(size_t idx, bool system_scope) {
-  amd::ScopedLock sl(lock_);
-  if (idx < irq_signals_.size()) return irq_signals_[idx];
-  while (irq_signals_.size() <= idx) {
-    auto* ps = AllocateOneSignal(true, system_scope);
-    if (ps == nullptr) return nullptr;
-    irq_signals_.push_back(ps);
-  }
-  irq_capacity_.store(irq_signals_.size(), std::memory_order_release);
-  return irq_signals_[idx];
-}
-
-GraphSignalPool::~GraphSignalPool() {
-  for (auto* ps : signals_) { ps->release(); }
-  for (auto* ps : irq_signals_) { ps->release(); }
-}
-
 // ================================================================================================
 uint8_t* Device::CreateBarrierPacket() const {
   static constexpr uint16_t kBarrierNopHeader =
@@ -3687,45 +3614,6 @@ uint8_t* Device::CreateBarrierPacket() const {
   return raw;
 }
 
-// ================================================================================================
-GraphSignalPool* Device::CreateGraphSignalPool(
-    size_t gpu_count, size_t irq_count,
-    size_t segment_count, std::vector<void*>& hw_events) const {
-  // Clear upfront so every failure path honours the contract: caller sees an
-  // empty hw_events whenever nullptr is returned, regardless of what was passed in.
-  hw_events.clear();
-
-  auto* pool = new GraphSignalPool();
-
-  if (gpu_count > 0 && !pool->Allocate(gpu_count)) {
-    delete pool;
-    return nullptr;
-  }
-  if (irq_count > 0 && !pool->AllocateIrq(irq_count, true)) {
-    delete pool;
-    return nullptr;
-  }
-
-  hw_events.resize(segment_count, nullptr);
-  for (size_t i = 0; i < segment_count; ++i) {
-    hw_events[i] = pool->Acquire();
-    if (hw_events[i] == nullptr) {
-      delete pool;
-      hw_events.clear();
-      return nullptr;
-    }
-  }
-
-  // Pre-allocation is done; reset so GetLastAcquired() only reflects GPU dispatches.
-  pool->ResetLastAcquired();
-  return pool;
-}
-
-// ================================================================================================
-size_t Device::GetGraphSignalPoolUsedCount(GraphSignalPool* pool) const {
-  return pool ? pool->UsedCount() : 0;
-}
-
 // ================================================================================================
 void Device::ApplyHwEventPatches(const std::vector<HwEventPatch>& patches,
                                  const std::vector<void*>& hw_events) const {
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 130c5f57876..c4051bf0779 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -114,63 +114,6 @@ class ProfilingSignal : public amd::ReferenceCountedObject {
   }
 };
 
-//! Graph-owned signal pool for per-launch signal allocation.
-//! Avoids runtime signal pool stalls (WaitCurrent/WaitNext) during graph execution.
-//! Two sub-pools: GPU-only signals (fast) and interrupt-capable signals (for host callbacks).
-struct GraphSignalPool {
-  ProfilingSignal* Acquire() {
-    size_t idx = next_idx_.fetch_add(1, std::memory_order_relaxed);
-    ProfilingSignal* ps;
-    if (idx < capacity_.load(std::memory_order_acquire)) {
-      ps = signals_[idx];
-    } else {
-      ps = GrowAndAcquire(idx);
-    }
-    if (ps) last_acquired_ = ps;
-    return ps;
-  }
-
-  ProfilingSignal* AcquireIrq(bool system_scope) {
-    size_t idx = irq_next_idx_.fetch_add(1, std::memory_order_relaxed);
-    ProfilingSignal* ps;
-    if (idx < irq_capacity_.load(std::memory_order_acquire)) {
-      ps = irq_signals_[idx];
-    } else {
-      ps = GrowAndAcquireIrq(idx, system_scope);
-    }
-    if (ps) last_acquired_ = ps;
-    return ps;
-  }
-
-  //! Returns the most recently acquired signal from a GPU dispatch (not pre-allocation)
-  ProfilingSignal* GetLastAcquired() const { return last_acquired_; }
-
-  //! Reset after pre-allocation so GetLastAcquired only reflects actual GPU dispatches
-  void ResetLastAcquired() { last_acquired_ = nullptr; }
-
-  bool Allocate(size_t count);
-  bool AllocateIrq(size_t count, bool system_scope);
-
-  size_t UsedCount() const { return next_idx_.load(std::memory_order_relaxed); }
-  size_t UsedIrqCount() const { return irq_next_idx_.load(std::memory_order_relaxed); }
-
-  ~GraphSignalPool();
-
- private:
-  std::vector<ProfilingSignal*> signals_;       //!< GPU-only signals
-  std::atomic<size_t> capacity_{0};             //!< Current GPU-only signal count (atomic to avoid race with growth)
-  std::atomic<size_t> next_idx_{0};             //!< Next GPU-only signal index
-  std::vector<ProfilingSignal*> irq_signals_;   //!< Interrupt-capable signals
-  std::atomic<size_t> irq_capacity_{0};         //!< Current interrupt signal count
-  std::atomic<size_t> irq_next_idx_{0};         //!< Next interrupt signal index
-  ProfilingSignal* last_acquired_ = nullptr;    //!< Most recently acquired signal
-  amd::Monitor lock_;                           //!< Protects growth of both vectors
-
-  static ProfilingSignal* AllocateOneSignal(bool interrupt, bool system_scope);
-  ProfilingSignal* GrowAndAcquire(size_t idx);
-  ProfilingSignal* GrowAndAcquireIrq(size_t idx, bool system_scope);
-};
-
 class Sampler : public device::Sampler {
  public:
   //! Constructor
@@ -545,10 +488,6 @@ class Device : public NullDevice {
   virtual uint8_t* CreateBarrierPacket() const override;
   virtual void ApplyHwEventPatches(const std::vector<HwEventPatch>& patches,
                                    const std::vector<void*>& hw_events) const override;
-  virtual GraphSignalPool* CreateGraphSignalPool(
-      size_t gpu_count, size_t irq_count,
-      size_t segment_count, std::vector<void*>& hw_events) const override;
-  virtual size_t GetGraphSignalPoolUsedCount(GraphSignalPool* pool) const override;
   virtual bool CreateUserEvent(amd::UserEvent* event) const override;
   virtual void SetUserEvent(amd::UserEvent* event) const override;
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 59d183efc0a..8757a774d12 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -567,89 +567,6 @@ bool VirtualGPU::HwQueueTracker::Create() {
 hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_val, Timestamp* ts,
                                                       bool attach_signal) {
   amd::Command* cmd = gpu_.command();
-
-  // Graph signal pool fast path: skip WaitCurrent/WaitNext, use graph-owned signals.
-  // No fallback to runtime pool — graph pool grows on demand.
-  if (cmd != nullptr && cmd->graphSignalPool() != nullptr) {
-    if (!attach_signal) {
-      if (cmd->HwEvent() != nullptr) {
-        reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
-      }
-      cmd->SetHwEvent(nullptr);
-      return hsa_signal_t{0};
-    }
-
-    auto* pool = cmd->graphSignalPool();
-
-    bool enqueHandler = false;
-    if (ts != nullptr) {
-      enqueHandler =
-          (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr) &&
-          !ts->command().CpuWaitRequested();
-    }
-    bool use_irq = enqueHandler || (IS_WINDOWS && gpu_.ForceIrq());
-    use_irq |= !gpu_.dev().ActiveWait();
-
-    ProfilingSignal* ps;
-    if (use_irq) {
-      const Settings& settings = gpu_.dev().settings();
-      ps = pool->AcquireIrq(settings.system_scope_signal_);
-    } else {
-      ps = pool->Acquire();
-    }
-
-    if (ps == nullptr) {
-      LogError("GraphSignalPool: failed to allocate signal");
-      if (cmd->HwEvent() != nullptr) {
-        reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
-      }
-      cmd->SetHwEvent(nullptr);
-      return hsa_signal_t{0};
-    }
-
-    Hsa::signal_silent_store_relaxed(ps->signal_, init_val);
-    ps->flags_.done_ = false;
-    ps->engine_ = engine_;
-    ps->flags_.isPacketDispatch_ = false;
-    ps->ResetCachedTiming();
-
-    if (cmd->HwEvent() != nullptr) {
-      reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
-    }
-    cmd->SetHwEvent(ps);
-    ps->retain();
-
-    if (ts != nullptr) {
-      ts->retain();
-      ps->ts_ = ts;
-      ts->AddProfilingSignal(ps);
-
-      if (enqueHandler) {
-        uint32_t handler_init = kInitSignalValueOne;
-        if (ts->command().Callback() != nullptr) {
-          bool blocking = ts->command().Callback()->blocking_;
-          ts->SetCallbackSignal(ps->signal_, blocking);
-          if (blocking) {
-            Hsa::signal_add_relaxed(ps->signal_, 1);
-            handler_init += 1;
-          }
-        }
-        gpu_.QueuedAsyncHandlers()++;
-        ts->gpu()->retain();
-        hsa_status_t result = Hsa::signal_async_handler(
-            ps->signal_, HSA_SIGNAL_CONDITION_LT, handler_init, &HsaAmdSignalHandler, ts);
-        if (HSA_STATUS_SUCCESS != result) {
-          gpu_.QueuedAsyncHandlers()--;
-          ts->gpu()->release();
-          LogError("hsa_amd_signal_async_handler() failed in graph pool path");
-        }
-      }
-    }
-    return ps->signal_;
-  }
-
-  // --- Normal runtime pool path ---
-
   // If no signal is needed, decrement the refcount and clear the hw_event of current command
   if (!attach_signal) {
     if (nullptr != cmd) {
@@ -823,16 +740,15 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
   if (explicit_wait) {
     bool skip_internal_signal = false;
 
-    ProfilingSignal* last_signal = GetLastSignal();
     for (uint32_t i = 0; i < external_signals_.size(); ++i) {
       // If external signal matches internal one, then skip it
-      if (external_signals_[i]->signal_.handle == last_signal->signal_.handle) {
+      if (external_signals_[i]->signal_.handle == signal_list_[current_id_]->signal_.handle) {
         skip_internal_signal = true;
       }
     }
     // Add the oldest signal into the tracking for a wait
     if (!skip_internal_signal) {
-      external_signals_.push_back(last_signal);
+      external_signals_.push_back(signal_list_[current_id_]);
     }
   }
 
@@ -889,26 +805,14 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
   return true;
 }
 
-// ================================================================================================
-ProfilingSignal* VirtualGPU::HwQueueTracker::GetLastSignal() const {
-  auto* cmd = gpu_.command();
-  if (cmd != nullptr && cmd->graphSignalPool() != nullptr) {
-    auto* last = cmd->graphSignalPool()->GetLastAcquired();
-    if (last != nullptr) return last;
-  }
-  return signal_list_[current_id_];
-}
-
 // ================================================================================================
 bool VirtualGPU::HwQueueTracker::WaitCurrent() {
-  ProfilingSignal* signal = GetLastSignal();
+  ProfilingSignal* signal = signal_list_[current_id_];
   return CpuWaitForSignal(signal);
 }
 
 // ================================================================================================
 void VirtualGPU::HwQueueTracker::WaitNext() {
-  auto* cmd = gpu_.command();
-  if (cmd != nullptr && cmd->graphSignalPool() != nullptr) return;
   size_t next = (current_id_ + 1) % signal_list_.size();
   ProfilingSignal* signal = signal_list_[next];
   // Only wait, there is no need to save timestamp for the next signal
@@ -918,14 +822,6 @@ void VirtualGPU::HwQueueTracker::WaitNext() {
 
 // ================================================================================================
 void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
-  auto* cmd = gpu_.command();
-  if (cmd != nullptr && cmd->graphSignalPool() != nullptr) {
-    auto* last = cmd->graphSignalPool()->GetLastAcquired();
-    if (last != nullptr) {
-      Hsa::signal_silent_store_relaxed(last->signal_, 0);
-    }
-    return;
-  }
   // Reset the signal and return
   Hsa::signal_silent_store_relaxed(signal_list_[current_id_]->signal_, 0);
   // Fallback to the previous signal
@@ -1568,16 +1464,8 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector<uint8_t>& flatPack
              (thisChunk - firstCount) * kPacketSize);
     }
 
-    // Attach signal to the last packet when requested (before per-packet logging).
-    auto* lastSlotPtr = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
-        queueBase + ((startIndex + chunkEnd - 1) & queueMask) * kPacketSize);
-    if (isLastChunk && (attach_signal || blocking) && timestamp_ == nullptr) {
-      lastSlotPtr->completion_signal = Barriers().ActiveSignal();
-    }
-
-    // Per-packet fixups: profiling signals, kernel-name printing, and barrier logging.
-    if (timestamp_ != nullptr || IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2) ||
-        IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL)) {
+    // Per-packet fixups: profiling signals and kernel-name printing.
+    if (timestamp_ != nullptr || IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2)) {
       for (size_t i = chunkStart; i < chunkEnd; ++i) {
         const uint64_t slotIdx = (startIndex + i) & queueMask;
         auto* slot = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
@@ -1586,6 +1474,8 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector<uint8_t>& flatPack
         const uint8_t pktType =
             extractAqlBits(hdr, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
         if (timestamp_ != nullptr) {
+          // When pre_patched, skip any slot whose completion_signal was already
+          // written by ApplyHwEventPatches (non-zero means pre-patched).
           bool has_prepatched_signal = pre_patched && (slot->completion_signal.handle != 0);
           if (!has_prepatched_signal) {
             slot->completion_signal =
@@ -1623,29 +1513,16 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector<uint8_t>& flatPack
                   slot->kernel_object, slot->kernarg_address,
                   slot->completion_signal, slot->reserved2,
                   Hsa::queue_load_read_index_scacquire(gpu_queue_), slotIdx);
-        } else if (pktType == HSA_PACKET_TYPE_BARRIER_AND) {
-          auto* bpkt = reinterpret_cast<hsa_barrier_and_packet_t*>(slot);
-          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
-                  "SWq=0x%zx, HWq=0x%zx, id=%d, Graph Barrier-AND Header = "
-                  "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
-                  "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], "
-                  "completion_signal=0x%zx, rptr=%u, wptr=%u",
-                  gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, hdr, pktType,
-                  extractAqlBits(hdr, HSA_PACKET_HEADER_BARRIER,
-                                 HSA_PACKET_HEADER_WIDTH_BARRIER),
-                  extractAqlBits(hdr, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
-                                 HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-                  extractAqlBits(hdr, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                                 HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
-                  bpkt->dep_signal[0].handle, bpkt->dep_signal[1].handle,
-                  bpkt->dep_signal[2].handle, bpkt->dep_signal[3].handle,
-                  bpkt->dep_signal[4].handle,
-                  bpkt->completion_signal.handle,
-                  Hsa::queue_load_read_index_scacquire(gpu_queue_), slotIdx);
         }
       }
     }
 
+    auto* lastSlotPtr = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+        queueBase + ((startIndex + chunkEnd - 1) & queueMask) * kPacketSize);
+    if (isLastChunk && (attach_signal || blocking) && timestamp_ == nullptr) {
+      lastSlotPtr->completion_signal = Barriers().ActiveSignal();
+    }
+
     // Write valid headers and ring the doorbell for this chunk.
     // Hold global packet-0's header back in the first chunk so the GPU sees a
     // fully-committed batch before starting (AQL protocol).  Subsequent chunks
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 1667f6b3a9f..1521f137f3d 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -286,7 +286,7 @@ class VirtualGPU : public device::VirtualDevice {
     void AddExternalSignal(ProfilingSignal* signal) { external_signals_.push_back(signal); }
 
     //! Get the last active signal on the queue
-    ProfilingSignal* GetLastSignal() const;
+    ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
 
     //! Clear external signals
     void ClearExternalSignals() { external_signals_.clear(); }
@@ -524,7 +524,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void hasPendingDispatch() { hasPendingDispatch_ = true; }
   bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; }
-  void addSystemScope() override {
+  void addSystemScope() {
     addSystemScope_ = true;
     fence_state_ = amd::Device::CacheState::kCacheStateInvalid;
   }
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index bfd753e8fb5..22fd2688182 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -63,19 +63,13 @@ Event::~Event() {
 
 // ================================================================================================
 AccumulateCommand::~AccumulateCommand() {
-  if (owned_graph_signal_pool_ != nullptr) {
-    // Graph pool owns all signals (SyncPlan HW events + ActiveSignal signals).
-    // Skip per-device HW event release — pool destructor handles everything.
-    delete owned_graph_signal_pool_;
-  } else {
-    // Non-graph path: release HW events per device as before
-    for (auto& device_events_pair : hw_events_) {
-      Device* dev = device_events_pair.first;
-      if (dev != nullptr) {
-        for (void* hw_event : device_events_pair.second) {
-          if (hw_event != nullptr) {
-            dev->ReleaseGlobalSignal(hw_event);
-          }
+  // Release all retained HW events per device
+  for (auto& device_events_pair : hw_events_) {
+    Device* dev = device_events_pair.first;
+    if (dev != nullptr) {
+      for (void* hw_event : device_events_pair.second) {
+        if (hw_event != nullptr) {
+          dev->ReleaseGlobalSignal(hw_event);
         }
       }
     }
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index ac5b7c3b676..f3c27258882 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -42,7 +42,6 @@ namespace amd {
 class Command;
 class HostQueue;
 union ComputeCommand;
-namespace roc { struct GraphSignalPool; }
 
 /*! \brief Encapsulates the status of a command.
  *
@@ -311,7 +310,6 @@ class Command : public Event {
   GraphKernelArgManager* graphKernArgMgr_ = nullptr;  //!< KernelMgr for graph
   address kernArgOffset_ = nullptr;  //!< KernelArg buffer to used when graph capturing is enabled
   const std::string** capturedKernelName_ = nullptr;  //!< Kernel under capture
-  roc::GraphSignalPool* graph_signal_pool_ = nullptr;  //!< Graph-owned signal pool for this command
  protected:
   bool cpu_wait_ = false;  //!< If true, then the command was issued for CPU/GPU sync
 
@@ -461,11 +459,6 @@ class Command : public Event {
 
   //! Check if this command(should be a marker) requires CPU wait
   bool CpuWaitRequested() const { return cpu_wait_; }
-
-  //! Set graph signal pool for graph-owned signal allocation
-  void SetGraphSignalPool(roc::GraphSignalPool* pool) { graph_signal_pool_ = pool; }
-  //! Get graph signal pool (nullptr for non-graph commands)
-  roc::GraphSignalPool* graphSignalPool() const { return graph_signal_pool_; }
 };
 
 class UserEvent : public Command {
@@ -1509,8 +1502,6 @@ class AccumulateCommand : public Command {
   std::vector<std::pair<uint64_t, uint64_t>> tsList_;
   //! HW events that need to be released when this command is destroyed
   std::unordered_map<Device*, std::vector<void*>> hw_events_;
-  //! Graph signal pool owned by this command (deleted after GPU completion)
-  roc::GraphSignalPool* owned_graph_signal_pool_ = nullptr;
 
  public:
   //! Create a new Marker
@@ -1534,9 +1525,6 @@ class AccumulateCommand : public Command {
     }
   }
 
-  //! Transfer ownership of graph signal pool for cleanup after GPU completion
-  void SetOwnedGraphSignalPool(roc::GraphSignalPool* pool) { owned_graph_signal_pool_ = pool; }
-
   //! Add kernel name to the list if available
   void addKernelName(const std::string* kernelName) { kernelNames_.push_back(kernelName); }