From 1f33802dc0bb3ce7893ffc2b23b495992336e9aa Mon Sep 17 00:00:00 2001 From: anusha godavarthy Surya Date: Sat, 9 May 2026 10:28:06 +0000 Subject: [PATCH] =?UTF-8?q?Revert=20"[hipGraph]=20Add=20graph=20signal=20p?= =?UTF-8?q?ool=20and=20remove=20pre/post=20markers=20for=20non-=E2=80=A6?= =?UTF-8?q?=20(#5333)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f17b05703aecf3f9f78a979239349ed00d27e289. Reverting due to two issues observed in GraphBench: 1. Resource leak: per-launch GraphSignalPool was deleted via a forward- declared pointer in ~AccumulateCommand, so ~GraphSignalPool never ran and every per-launch HSA signal was leaked ("Resource leak detected by SharedSignalPool, N Signals leaked"). 2. Per-launch host overhead: shifting from the runtime's rotating signal_list_ (which reuses signals after warmup with WaitCurrent + WaitNext) to a per-launch graph signal pool reintroduces N× signal_create + signal_destroy on every hipGraphLaunch, causing measurable regression in graph_bench across straight/paths2/paths4/ full2/full4 topologies. A safer redesign that preserves PR #5333's per-launch isolation while recycling signals (mirroring the runtime pool's WaitCurrent/WaitNext discipline) is needed before re-landing. Co-authored-by: Cursor --- .../clr/hipamd/src/hip_graph_internal.cpp | 122 +++++++------- .../clr/hipamd/src/hip_graph_internal.hpp | 10 -- projects/clr/rocclr/device/device.hpp | 16 -- projects/clr/rocclr/device/rocm/rocdevice.cpp | 112 ------------- projects/clr/rocclr/device/rocm/rocdevice.hpp | 61 ------- .../clr/rocclr/device/rocm/rocvirtual.cpp | 149 ++---------------- .../clr/rocclr/device/rocm/rocvirtual.hpp | 4 +- projects/clr/rocclr/platform/command.cpp | 20 +-- projects/clr/rocclr/platform/command.hpp | 12 -- 9 files changed, 77 insertions(+), 429 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 83d7181473a..c152d1a253a 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -432,7 +432,6 @@ void GraphExec::BuildSyncPlan() { sync_plan_.patch_list.clear(); sync_plan_.barrier_packets.clear(); sync_plan_.leaf_segment_ids.clear(); - graph_irq_signal_count_ = 0; auto* device = g_devices[instantiateDeviceId_]->devices()[0]; @@ -552,19 +551,7 @@ void GraphExec::BuildSyncPlan() { if (segment.segment_ids_edges.empty()) { sync_plan_.leaf_segment_ids.push_back(segment.id); } - - // Count interrupt signals needed for non-captured host nodes in this segment. - for (size_t i = 0; i < segment.nodes.size(); ++i) { - if (!segBatch.node_capture_status[i] && - segment.nodes[i]->GetType() == hipGraphNodeTypeHost) { - graph_irq_signal_count_ += 2; - } - } } - - // Seed GPU-only signal count with known minimum (SyncPlan HW events) - // so even the first launch pre-allocates these instead of growing on demand. - graph_signal_count_ = sync_plan_.num_segments; } // ================================================================================================ @@ -1728,19 +1715,19 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, auto* device = g_devices[launch_stream->DeviceId()]->devices()[0]; - // Create per-launch graph signal pool: pre-allocates GPU-only and IRQ signals, - // acquires one hw-event per segment, and resets GetLastAcquired() so it only - // tracks actual dispatches. All pool method calls are encapsulated in the factory. + // Allocate HW events for all segments std::vector segment_hw_events; - auto* launch_pool = device->CreateGraphSignalPool( - graph_signal_count_, graph_irq_signal_count_, - static_cast(sync_plan_.num_segments), segment_hw_events); - if (launch_pool == nullptr) { - if (out_status != nullptr) *out_status = hipErrorOutOfMemory; - return nullptr; + if (sync_plan_.num_segments > 0) { + if (!device->CreateHwEvents(sync_plan_.num_segments, segment_hw_events)) { + if (out_status != nullptr) { + *out_status = hipErrorOutOfMemory; + } + return nullptr; + } } - // Apply pre-computed patches — writes HW events directly into flatPacketData + // Apply pre-computed patches -- writes HW events directly into flatPacketData + // via the flat_packet pointers resolved at instantiate time, so no rebuild needed. if (!sync_plan_.patch_list.empty()) { device->ApplyHwEventPatches(sync_plan_.patch_list, segment_hw_events); } @@ -1748,13 +1735,18 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, // Track stream assignments across levels std::unordered_map segment_to_stream; - // Single AccumulateCommand on launch_stream manages graph pool lifetime + // Single AccumulateCommand on launch_stream manages all HW event lifetimes // and serves as the dispatch anchor for all segments across all streams. - // Note: SetOwnedGraphSignalPool transfers ownership — on error paths below, - // graph_accumulate->release() deletes the pool via ~AccumulateCommand. auto* graph_accumulate = new amd::AccumulateCommand(*launch_stream, {}, nullptr); - graph_accumulate->SetGraphSignalPool(launch_pool); - graph_accumulate->SetOwnedGraphSignalPool(launch_pool); + + // Register HW events with graph_accumulate for lifetime tracking. + // addHwEvent does not retain — ownership transfers from CreateHwEvents + // to graph_accumulate. ~AccumulateCommand releases them after graph completion. + for (auto& hw_event : segment_hw_events) { + if (hw_event != nullptr) { + graph_accumulate->addHwEvent(hw_event, device); + } + } // Process segments level by level for (int level = 0; level <= max_dependency_level_; ++level) { @@ -1767,13 +1759,13 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, AssignStreamsToSegments(segments_at_level, launch_stream, streams, segment_to_stream); if (level == 0) { - // Synchronize internal streams with launch stream's last command if available. - // These wait markers use runtime pool signals (no graph pool) for boundary sync. + // Synchronize internal streams with launch stream's last command if available amd::Command* launch_last_cmd = launch_stream->getLastQueuedCommand(true); if (launch_last_cmd != nullptr) { amd::Command::EventWaitList launch_wait_list; launch_wait_list.push_back(launch_last_cmd); + // For each segment at level 0, if it's on a different stream, add a wait marker for (int segment_id : segments_at_level) { hip::Stream* seg_stream = segment_to_stream[segment_id]; if (seg_stream != launch_stream) { @@ -1788,7 +1780,7 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, } } - // Dispatch each segment — barriers are in the batch, signals are patched + // Dispatch each segment -- barriers are in the batch, signals are patched for (int segment_id : segments_at_level) { const auto& segment = segments_[segment_id]; hip::Stream* current_stream = segment_to_stream[segment_id]; @@ -1805,7 +1797,10 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, } } - // Synchronize parallel streams back to the launch stream via leaf HW events + // Synchronize parallel streams back to the launch stream. + // Each leaf segment already has a completion HW event signal patched onto its + // last packet. Add those HW events as dependencies on graph_accumulate so + // the runtime emits barrier packets when it is enqueued on the launch stream. if (IsLeafNodeSyncRequired()) { for (int seg_id : sync_plan_.leaf_segment_ids) { auto it = segment_to_stream.find(seg_id); @@ -1815,12 +1810,6 @@ amd::Command* GraphExec::EnqueueSegmentedGraph(hip::Stream* launch_stream, } } - // Cache GPU-only signal count for future launches (first launch or if count grew) - size_t used = device->GetGraphSignalPoolUsedCount(launch_pool); - if (used > graph_signal_count_) { - graph_signal_count_ = used; - } - graph_accumulate->enqueue(); if (out_status != nullptr) { @@ -1844,11 +1833,8 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream size_t batchIndex = 0; - // Lambda to dispatch the current batch at batchIndex. - // attach_signal: when true, places a completion signal on the last packet so a - // subsequent non-captured node (e.g. SDMA) can wait on it directly via - // WaitingSignal, avoiding an extra barrier packet in the AQL queue. - auto dispatchCurrentBatch = [&](bool attach_signal = false) -> hipError_t { + // Lambda to dispatch the current batch at batchIndex + auto dispatchCurrentBatch = [&]() -> hipError_t { if (!segBatch || batchIndex >= segBatch->packet_batches.size()) { return hipSuccess; } @@ -1875,7 +1861,7 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream if (!flatData->empty()) { bool batchStatus = stream->vdev()->dispatchAqlPacketBatchFlat( - *flatData, *flatHdrs, accumulate, attach_signal, kernelNamesToDispatch, true); + *flatData, *flatHdrs, accumulate, false, kernelNamesToDispatch, true); if (!batchStatus) { return hipErrorUnknown; } @@ -1939,6 +1925,8 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream } for (size_t i = 0; i < segment.nodes.size(); ++i) { + auto& node = segment.nodes[i]; + if (segBatch && i < segBatch->node_capture_status.size() && segBatch->node_capture_status[i]) { // Node was successfully captured - dispatch its batch @@ -1953,36 +1941,36 @@ hipError_t GraphExec::EnqueueSegment(const Segment& segment, hip::Stream* stream // Skip all consecutive captured nodes that belong to this batch i += packetBatch.nodeRanges.size() - 1; - // Check if an uncaptured SDMA node (memcpy/memset) follows this batch. - // If so, attach a signal to the last packet so the SDMA engine can wait - // on it directly via WaitingSignal, avoiding an extra barrier packet. - bool sdma_follows = false; - size_t next = i + 1; - if (next < segment.nodes.size() && - next < segBatch->node_capture_status.size() && - !segBatch->node_capture_status[next]) { - sdma_follows = (segment.nodes[next]->GetType() == hipGraphNodeTypeMemcpy); - } - if (sdma_follows) { - stream->vdev()->addSystemScope(); - } - status = dispatchCurrentBatch(sdma_follows); + status = dispatchCurrentBatch(); if (status != hipSuccess) return status; } } else { - // Non-captured nodes execute through the normal command pipeline. - auto& uncaptured_node = segment.nodes[i]; + // Node doesn't support capture - execute individually. + // Flat dispatch bypasses the Barriers() tracker (ActiveSignal is skipped). + // Enqueue Markers before and after the non-captured node to: + // Before: resync the Barriers() tracker so releaseGpuMemoryFence/WaitCurrent + // can track queue progress for the node's internal operations. + // After: ensure the node's commands (e.g. host node blocking callbacks) are + // fully flushed to the HW queue before the next flat batch is + // dispatched, which writes directly to the HW queue. + auto pre_marker = new amd::Marker(*stream, kMarkerDisableFlush, {}); + if (pre_marker != nullptr) { + pre_marker->enqueue(); + pre_marker->release(); + } if (DEBUG_HIP_GRAPH_DOT_PRINT) { - uncaptured_node->stream_id_ = stream->GetStreamId(); - uncaptured_node->hw_queue_id_ = stream->getQueueID(); + node->stream_id_ = stream->GetStreamId(); + node->hw_queue_id_ = stream->getQueueID(); } - uncaptured_node->SetStream(stream); - status = uncaptured_node->CreateCommand(uncaptured_node->GetQueue()); + node->SetStream(stream); + status = node->CreateCommand(node->GetQueue()); if (status != hipSuccess) return status; - if (accumulate->graphSignalPool() != nullptr) { - uncaptured_node->SetGraphSignalPoolOnCommands(accumulate->graphSignalPool()); + node->EnqueueCommands(stream); + auto post_marker = new amd::Marker(*stream, kMarkerDisableFlush, {}); + if (post_marker != nullptr) { + post_marker->enqueue(); + post_marker->release(); } - uncaptured_node->EnqueueCommands(stream); } } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 4651b3acbc6..4ac0abe1a43 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -291,12 +291,6 @@ class GraphNode : public hipGraphNodeDOTAttribute { int GetID() const { return id_; } /// Returns command for graph node virtual std::vector& GetCommands() { return commands_; } - /// Propagate graph signal pool to all commands owned by this node - void SetGraphSignalPoolOnCommands(amd::roc::GraphSignalPool* pool) { - for (auto& cmd : commands_) { - if (cmd != nullptr) cmd->SetGraphSignalPool(pool); - } - } /// Returns graph node type hipGraphNodeType GetType() const { return type_; } /// Clone graph node @@ -1138,10 +1132,6 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph { SyncPlan sync_plan_; void BuildSyncPlan(); - - //! Cached signal counts for graph signal pool pre-allocation - size_t graph_signal_count_ = 0; //!< GPU-only signals, cached after first launch - size_t graph_irq_signal_count_ = 0; //!< Interrupt signals, determined at instantiation }; class ChildGraphNode : public GraphNode, public GraphExec { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 33c2c1bff69..c1d3958c1ba 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -43,7 +43,6 @@ #include namespace amd { -namespace roc { struct GraphSignalPool; } class Command; class CommandQueue; class ReadMemoryCommand; @@ -1355,8 +1354,6 @@ class VirtualDevice : public amd::ReferenceCountedObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; - //! Insert a system scope on the next dispatch - virtual void addSystemScope() {} //! Init hidden heap for device memory allocations virtual void HiddenHeapInit() = 0; @@ -2132,19 +2129,6 @@ class Device : public RuntimeObject { virtual void ApplyHwEventPatches(const std::vector& patches, const std::vector& hw_events) const {} - // Creates a fully-initialised per-launch graph signal pool. - // Allocates gpu_count GPU-only signals and irq_count interrupt signals, - // then acquires segment_count hw-event slots into hw_events and resets - // the last-acquired pointer so GetLastAcquired() only tracks dispatches. - // Returns nullptr (and leaves hw_events empty) on allocation failure. - virtual roc::GraphSignalPool* CreateGraphSignalPool( - size_t gpu_count, size_t irq_count, - size_t segment_count, std::vector& hw_events) const { return nullptr; } - - // Returns the number of GPU-only signals consumed from the pool so far. - // Used by the graph executor to cache the count for the next launch. - virtual size_t GetGraphSignalPoolUsedCount(roc::GraphSignalPool* pool) const { return 0; } - virtual const bool isFineGrainSupported() const { return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 6315c3984f8..967d5be7ae8 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3599,79 +3599,6 @@ void Device::DestroyHwEvent(void* hw_event) const { ReleaseGlobalSignal(hw_event); } -// ================================================================================================ -// GraphSignalPool implementation -// ================================================================================================ -ProfilingSignal* GraphSignalPool::AllocateOneSignal(bool interrupt, bool system_scope) { - auto* ps = new ProfilingSignal(); - hsa_status_t status; - if (interrupt && system_scope) { - status = Hsa::signal_create(1, 0, nullptr, &ps->signal_); - } else { - status = Hsa::signal_create(1, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &ps->signal_); - } - if (status != HSA_STATUS_SUCCESS) { - delete ps; - return nullptr; - } - ps->flags_.interrupt_ = interrupt; - return ps; -} - -bool GraphSignalPool::Allocate(size_t count) { - signals_.reserve(count); - for (size_t i = signals_.size(); i < count; ++i) { - auto* ps = AllocateOneSignal(false, false); - if (ps == nullptr) return false; - signals_.push_back(ps); - } - capacity_.store(signals_.size(), std::memory_order_release); - next_idx_.store(0); - irq_next_idx_.store(0); - return true; -} - -bool GraphSignalPool::AllocateIrq(size_t count, bool system_scope) { - irq_signals_.reserve(count); - for (size_t i = irq_signals_.size(); i < count; ++i) { - auto* ps = AllocateOneSignal(true, system_scope); - if (ps == nullptr) return false; - irq_signals_.push_back(ps); - } - irq_capacity_.store(irq_signals_.size(), std::memory_order_release); - irq_next_idx_.store(0); - return true; -} - -ProfilingSignal* GraphSignalPool::GrowAndAcquire(size_t idx) { - amd::ScopedLock sl(lock_); - if (idx < signals_.size()) return signals_[idx]; - while (signals_.size() <= idx) { - auto* ps = AllocateOneSignal(false, false); - if (ps == nullptr) return nullptr; - signals_.push_back(ps); - } - capacity_.store(signals_.size(), std::memory_order_release); - return signals_[idx]; -} - -ProfilingSignal* GraphSignalPool::GrowAndAcquireIrq(size_t idx, bool system_scope) { - amd::ScopedLock sl(lock_); - if (idx < irq_signals_.size()) return irq_signals_[idx]; - while (irq_signals_.size() <= idx) { - auto* ps = AllocateOneSignal(true, system_scope); - if (ps == nullptr) return nullptr; - irq_signals_.push_back(ps); - } - irq_capacity_.store(irq_signals_.size(), std::memory_order_release); - return irq_signals_[idx]; -} - -GraphSignalPool::~GraphSignalPool() { - for (auto* ps : signals_) { ps->release(); } - for (auto* ps : irq_signals_) { ps->release(); } -} - // ================================================================================================ uint8_t* Device::CreateBarrierPacket() const { static constexpr uint16_t kBarrierNopHeader = @@ -3687,45 +3614,6 @@ uint8_t* Device::CreateBarrierPacket() const { return raw; } -// ================================================================================================ -GraphSignalPool* Device::CreateGraphSignalPool( - size_t gpu_count, size_t irq_count, - size_t segment_count, std::vector& hw_events) const { - // Clear upfront so every failure path honours the contract: caller sees an - // empty hw_events whenever nullptr is returned, regardless of what was passed in. - hw_events.clear(); - - auto* pool = new GraphSignalPool(); - - if (gpu_count > 0 && !pool->Allocate(gpu_count)) { - delete pool; - return nullptr; - } - if (irq_count > 0 && !pool->AllocateIrq(irq_count, true)) { - delete pool; - return nullptr; - } - - hw_events.resize(segment_count, nullptr); - for (size_t i = 0; i < segment_count; ++i) { - hw_events[i] = pool->Acquire(); - if (hw_events[i] == nullptr) { - delete pool; - hw_events.clear(); - return nullptr; - } - } - - // Pre-allocation is done; reset so GetLastAcquired() only reflects GPU dispatches. - pool->ResetLastAcquired(); - return pool; -} - -// ================================================================================================ -size_t Device::GetGraphSignalPoolUsedCount(GraphSignalPool* pool) const { - return pool ? pool->UsedCount() : 0; -} - // ================================================================================================ void Device::ApplyHwEventPatches(const std::vector& patches, const std::vector& hw_events) const { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 130c5f57876..c4051bf0779 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -114,63 +114,6 @@ class ProfilingSignal : public amd::ReferenceCountedObject { } }; -//! Graph-owned signal pool for per-launch signal allocation. -//! Avoids runtime signal pool stalls (WaitCurrent/WaitNext) during graph execution. -//! Two sub-pools: GPU-only signals (fast) and interrupt-capable signals (for host callbacks). -struct GraphSignalPool { - ProfilingSignal* Acquire() { - size_t idx = next_idx_.fetch_add(1, std::memory_order_relaxed); - ProfilingSignal* ps; - if (idx < capacity_.load(std::memory_order_acquire)) { - ps = signals_[idx]; - } else { - ps = GrowAndAcquire(idx); - } - if (ps) last_acquired_ = ps; - return ps; - } - - ProfilingSignal* AcquireIrq(bool system_scope) { - size_t idx = irq_next_idx_.fetch_add(1, std::memory_order_relaxed); - ProfilingSignal* ps; - if (idx < irq_capacity_.load(std::memory_order_acquire)) { - ps = irq_signals_[idx]; - } else { - ps = GrowAndAcquireIrq(idx, system_scope); - } - if (ps) last_acquired_ = ps; - return ps; - } - - //! Returns the most recently acquired signal from a GPU dispatch (not pre-allocation) - ProfilingSignal* GetLastAcquired() const { return last_acquired_; } - - //! Reset after pre-allocation so GetLastAcquired only reflects actual GPU dispatches - void ResetLastAcquired() { last_acquired_ = nullptr; } - - bool Allocate(size_t count); - bool AllocateIrq(size_t count, bool system_scope); - - size_t UsedCount() const { return next_idx_.load(std::memory_order_relaxed); } - size_t UsedIrqCount() const { return irq_next_idx_.load(std::memory_order_relaxed); } - - ~GraphSignalPool(); - - private: - std::vector signals_; //!< GPU-only signals - std::atomic capacity_{0}; //!< Current GPU-only signal count (atomic to avoid race with growth) - std::atomic next_idx_{0}; //!< Next GPU-only signal index - std::vector irq_signals_; //!< Interrupt-capable signals - std::atomic irq_capacity_{0}; //!< Current interrupt signal count - std::atomic irq_next_idx_{0}; //!< Next interrupt signal index - ProfilingSignal* last_acquired_ = nullptr; //!< Most recently acquired signal - amd::Monitor lock_; //!< Protects growth of both vectors - - static ProfilingSignal* AllocateOneSignal(bool interrupt, bool system_scope); - ProfilingSignal* GrowAndAcquire(size_t idx); - ProfilingSignal* GrowAndAcquireIrq(size_t idx, bool system_scope); -}; - class Sampler : public device::Sampler { public: //! Constructor @@ -545,10 +488,6 @@ class Device : public NullDevice { virtual uint8_t* CreateBarrierPacket() const override; virtual void ApplyHwEventPatches(const std::vector& patches, const std::vector& hw_events) const override; - virtual GraphSignalPool* CreateGraphSignalPool( - size_t gpu_count, size_t irq_count, - size_t segment_count, std::vector& hw_events) const override; - virtual size_t GetGraphSignalPoolUsedCount(GraphSignalPool* pool) const override; virtual bool CreateUserEvent(amd::UserEvent* event) const override; virtual void SetUserEvent(amd::UserEvent* event) const override; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 59d183efc0a..8757a774d12 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -567,89 +567,6 @@ bool VirtualGPU::HwQueueTracker::Create() { hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_val, Timestamp* ts, bool attach_signal) { amd::Command* cmd = gpu_.command(); - - // Graph signal pool fast path: skip WaitCurrent/WaitNext, use graph-owned signals. - // No fallback to runtime pool — graph pool grows on demand. - if (cmd != nullptr && cmd->graphSignalPool() != nullptr) { - if (!attach_signal) { - if (cmd->HwEvent() != nullptr) { - reinterpret_cast(cmd->HwEvent())->release(); - } - cmd->SetHwEvent(nullptr); - return hsa_signal_t{0}; - } - - auto* pool = cmd->graphSignalPool(); - - bool enqueHandler = false; - if (ts != nullptr) { - enqueHandler = - (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr) && - !ts->command().CpuWaitRequested(); - } - bool use_irq = enqueHandler || (IS_WINDOWS && gpu_.ForceIrq()); - use_irq |= !gpu_.dev().ActiveWait(); - - ProfilingSignal* ps; - if (use_irq) { - const Settings& settings = gpu_.dev().settings(); - ps = pool->AcquireIrq(settings.system_scope_signal_); - } else { - ps = pool->Acquire(); - } - - if (ps == nullptr) { - LogError("GraphSignalPool: failed to allocate signal"); - if (cmd->HwEvent() != nullptr) { - reinterpret_cast(cmd->HwEvent())->release(); - } - cmd->SetHwEvent(nullptr); - return hsa_signal_t{0}; - } - - Hsa::signal_silent_store_relaxed(ps->signal_, init_val); - ps->flags_.done_ = false; - ps->engine_ = engine_; - ps->flags_.isPacketDispatch_ = false; - ps->ResetCachedTiming(); - - if (cmd->HwEvent() != nullptr) { - reinterpret_cast(cmd->HwEvent())->release(); - } - cmd->SetHwEvent(ps); - ps->retain(); - - if (ts != nullptr) { - ts->retain(); - ps->ts_ = ts; - ts->AddProfilingSignal(ps); - - if (enqueHandler) { - uint32_t handler_init = kInitSignalValueOne; - if (ts->command().Callback() != nullptr) { - bool blocking = ts->command().Callback()->blocking_; - ts->SetCallbackSignal(ps->signal_, blocking); - if (blocking) { - Hsa::signal_add_relaxed(ps->signal_, 1); - handler_init += 1; - } - } - gpu_.QueuedAsyncHandlers()++; - ts->gpu()->retain(); - hsa_status_t result = Hsa::signal_async_handler( - ps->signal_, HSA_SIGNAL_CONDITION_LT, handler_init, &HsaAmdSignalHandler, ts); - if (HSA_STATUS_SUCCESS != result) { - gpu_.QueuedAsyncHandlers()--; - ts->gpu()->release(); - LogError("hsa_amd_signal_async_handler() failed in graph pool path"); - } - } - } - return ps->signal_; - } - - // --- Normal runtime pool path --- - // If no signal is needed, decrement the refcount and clear the hw_event of current command if (!attach_signal) { if (nullptr != cmd) { @@ -823,16 +740,15 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi if (explicit_wait) { bool skip_internal_signal = false; - ProfilingSignal* last_signal = GetLastSignal(); for (uint32_t i = 0; i < external_signals_.size(); ++i) { // If external signal matches internal one, then skip it - if (external_signals_[i]->signal_.handle == last_signal->signal_.handle) { + if (external_signals_[i]->signal_.handle == signal_list_[current_id_]->signal_.handle) { skip_internal_signal = true; } } // Add the oldest signal into the tracking for a wait if (!skip_internal_signal) { - external_signals_.push_back(last_signal); + external_signals_.push_back(signal_list_[current_id_]); } } @@ -889,26 +805,14 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) { return true; } -// ================================================================================================ -ProfilingSignal* VirtualGPU::HwQueueTracker::GetLastSignal() const { - auto* cmd = gpu_.command(); - if (cmd != nullptr && cmd->graphSignalPool() != nullptr) { - auto* last = cmd->graphSignalPool()->GetLastAcquired(); - if (last != nullptr) return last; - } - return signal_list_[current_id_]; -} - // ================================================================================================ bool VirtualGPU::HwQueueTracker::WaitCurrent() { - ProfilingSignal* signal = GetLastSignal(); + ProfilingSignal* signal = signal_list_[current_id_]; return CpuWaitForSignal(signal); } // ================================================================================================ void VirtualGPU::HwQueueTracker::WaitNext() { - auto* cmd = gpu_.command(); - if (cmd != nullptr && cmd->graphSignalPool() != nullptr) return; size_t next = (current_id_ + 1) % signal_list_.size(); ProfilingSignal* signal = signal_list_[next]; // Only wait, there is no need to save timestamp for the next signal @@ -918,14 +822,6 @@ void VirtualGPU::HwQueueTracker::WaitNext() { // ================================================================================================ void VirtualGPU::HwQueueTracker::ResetCurrentSignal() { - auto* cmd = gpu_.command(); - if (cmd != nullptr && cmd->graphSignalPool() != nullptr) { - auto* last = cmd->graphSignalPool()->GetLastAcquired(); - if (last != nullptr) { - Hsa::signal_silent_store_relaxed(last->signal_, 0); - } - return; - } // Reset the signal and return Hsa::signal_silent_store_relaxed(signal_list_[current_id_]->signal_, 0); // Fallback to the previous signal @@ -1568,16 +1464,8 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector& flatPack (thisChunk - firstCount) * kPacketSize); } - // Attach signal to the last packet when requested (before per-packet logging). - auto* lastSlotPtr = reinterpret_cast( - queueBase + ((startIndex + chunkEnd - 1) & queueMask) * kPacketSize); - if (isLastChunk && (attach_signal || blocking) && timestamp_ == nullptr) { - lastSlotPtr->completion_signal = Barriers().ActiveSignal(); - } - - // Per-packet fixups: profiling signals, kernel-name printing, and barrier logging. - if (timestamp_ != nullptr || IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2) || - IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL)) { + // Per-packet fixups: profiling signals and kernel-name printing. + if (timestamp_ != nullptr || IsLogEnabled(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2)) { for (size_t i = chunkStart; i < chunkEnd; ++i) { const uint64_t slotIdx = (startIndex + i) & queueMask; auto* slot = reinterpret_cast( @@ -1586,6 +1474,8 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector& flatPack const uint8_t pktType = extractAqlBits(hdr, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE); if (timestamp_ != nullptr) { + // When pre_patched, skip any slot whose completion_signal was already + // written by ApplyHwEventPatches (non-zero means pre-patched). bool has_prepatched_signal = pre_patched && (slot->completion_signal.handle != 0); if (!has_prepatched_signal) { slot->completion_signal = @@ -1623,29 +1513,16 @@ bool VirtualGPU::dispatchAqlPacketBatchFlat(const std::vector& flatPack slot->kernel_object, slot->kernarg_address, slot->completion_signal, slot->reserved2, Hsa::queue_load_read_index_scacquire(gpu_queue_), slotIdx); - } else if (pktType == HSA_PACKET_TYPE_BARRIER_AND) { - auto* bpkt = reinterpret_cast(slot); - ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, - "SWq=0x%zx, HWq=0x%zx, id=%d, Graph Barrier-AND Header = " - "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " - "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], " - "completion_signal=0x%zx, rptr=%u, wptr=%u", - gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, hdr, pktType, - extractAqlBits(hdr, HSA_PACKET_HEADER_BARRIER, - HSA_PACKET_HEADER_WIDTH_BARRIER), - extractAqlBits(hdr, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(hdr, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), - bpkt->dep_signal[0].handle, bpkt->dep_signal[1].handle, - bpkt->dep_signal[2].handle, bpkt->dep_signal[3].handle, - bpkt->dep_signal[4].handle, - bpkt->completion_signal.handle, - Hsa::queue_load_read_index_scacquire(gpu_queue_), slotIdx); } } } + auto* lastSlotPtr = reinterpret_cast( + queueBase + ((startIndex + chunkEnd - 1) & queueMask) * kPacketSize); + if (isLastChunk && (attach_signal || blocking) && timestamp_ == nullptr) { + lastSlotPtr->completion_signal = Barriers().ActiveSignal(); + } + // Write valid headers and ring the doorbell for this chunk. // Hold global packet-0's header back in the first chunk so the GPU sees a // fully-committed batch before starting (AQL protocol). Subsequent chunks diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 1667f6b3a9f..1521f137f3d 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -286,7 +286,7 @@ class VirtualGPU : public device::VirtualDevice { void AddExternalSignal(ProfilingSignal* signal) { external_signals_.push_back(signal); } //! Get the last active signal on the queue - ProfilingSignal* GetLastSignal() const; + ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; } //! Clear external signals void ClearExternalSignals() { external_signals_.clear(); } @@ -524,7 +524,7 @@ class VirtualGPU : public device::VirtualDevice { void hasPendingDispatch() { hasPendingDispatch_ = true; } bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; } - void addSystemScope() override { + void addSystemScope() { addSystemScope_ = true; fence_state_ = amd::Device::CacheState::kCacheStateInvalid; } diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index bfd753e8fb5..22fd2688182 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -63,19 +63,13 @@ Event::~Event() { // ================================================================================================ AccumulateCommand::~AccumulateCommand() { - if (owned_graph_signal_pool_ != nullptr) { - // Graph pool owns all signals (SyncPlan HW events + ActiveSignal signals). - // Skip per-device HW event release — pool destructor handles everything. - delete owned_graph_signal_pool_; - } else { - // Non-graph path: release HW events per device as before - for (auto& device_events_pair : hw_events_) { - Device* dev = device_events_pair.first; - if (dev != nullptr) { - for (void* hw_event : device_events_pair.second) { - if (hw_event != nullptr) { - dev->ReleaseGlobalSignal(hw_event); - } + // Release all retained HW events per device + for (auto& device_events_pair : hw_events_) { + Device* dev = device_events_pair.first; + if (dev != nullptr) { + for (void* hw_event : device_events_pair.second) { + if (hw_event != nullptr) { + dev->ReleaseGlobalSignal(hw_event); } } } diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index ac5b7c3b676..f3c27258882 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -42,7 +42,6 @@ namespace amd { class Command; class HostQueue; union ComputeCommand; -namespace roc { struct GraphSignalPool; } /*! \brief Encapsulates the status of a command. * @@ -311,7 +310,6 @@ class Command : public Event { GraphKernelArgManager* graphKernArgMgr_ = nullptr; //!< KernelMgr for graph address kernArgOffset_ = nullptr; //!< KernelArg buffer to used when graph capturing is enabled const std::string** capturedKernelName_ = nullptr; //!< Kernel under capture - roc::GraphSignalPool* graph_signal_pool_ = nullptr; //!< Graph-owned signal pool for this command protected: bool cpu_wait_ = false; //!< If true, then the command was issued for CPU/GPU sync @@ -461,11 +459,6 @@ class Command : public Event { //! Check if this command(should be a marker) requires CPU wait bool CpuWaitRequested() const { return cpu_wait_; } - - //! Set graph signal pool for graph-owned signal allocation - void SetGraphSignalPool(roc::GraphSignalPool* pool) { graph_signal_pool_ = pool; } - //! Get graph signal pool (nullptr for non-graph commands) - roc::GraphSignalPool* graphSignalPool() const { return graph_signal_pool_; } }; class UserEvent : public Command { @@ -1509,8 +1502,6 @@ class AccumulateCommand : public Command { std::vector> tsList_; //! HW events that need to be released when this command is destroyed std::unordered_map> hw_events_; - //! Graph signal pool owned by this command (deleted after GPU completion) - roc::GraphSignalPool* owned_graph_signal_pool_ = nullptr; public: //! Create a new Marker @@ -1534,9 +1525,6 @@ class AccumulateCommand : public Command { } } - //! Transfer ownership of graph signal pool for cleanup after GPU completion - void SetOwnedGraphSignalPool(roc::GraphSignalPool* pool) { owned_graph_signal_pool_ = pool; } - //! Add kernel name to the list if available void addKernelName(const std::string* kernelName) { kernelNames_.push_back(kernelName); }