Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 245 additions & 139 deletions projects/clr/hipamd/src/hip_graph_internal.cpp

Large diffs are not rendered by default.

62 changes: 54 additions & 8 deletions projects/clr/hipamd/src/hip_graph_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

#pragma once
#include <algorithm>
#include <chrono>
#include <queue>
#include <stack>
#include <iostream>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
Expand All @@ -21,6 +23,11 @@
#include "hip_mempool_impl.hpp"
#include "hip_vm.hpp"

namespace amd { namespace roc {
struct GraphSignalPool;
struct ProfilingSignal;
}}

typedef struct ihipExtKernelEvents {
hipEvent_t startEvent_;
hipEvent_t stopEvent_;
Expand Down Expand Up @@ -238,6 +245,9 @@ class GraphNode : public hipGraphNodeDOTAttribute {
std::vector<uint8_t*>* batchPackets = nullptr,
std::vector<const std::string*>* batchKernelNames = nullptr) {
auto capture_stream = hip::getNullStream(g_devices[dev_id_]->devices()[0]->context(), false);
if (capture_stream == nullptr) {
return hipErrorInvalidDevice;
}
hipError_t status = CreateCommand(capture_stream);
if (status != hipSuccess) {
return status;
Expand Down Expand Up @@ -291,6 +301,12 @@ class GraphNode : public hipGraphNodeDOTAttribute {
int GetID() const { return id_; }
/// Returns command for graph node
virtual std::vector<amd::Command*>& GetCommands() { return commands_; }
/// Propagate graph signal pool to all commands owned by this node
void SetGraphSignalPoolOnCommands(amd::roc::GraphSignalPool* pool) {
for (auto& cmd : commands_) {
if (cmd != nullptr) cmd->SetGraphSignalPool(pool);
}
}
/// Returns graph node type
hipGraphNodeType GetType() const { return type_; }
/// Clone graph node
Expand Down Expand Up @@ -971,6 +987,12 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
}
}

// Destroy persistent pool. All parallel streams were finished above, so
// every signal owned by the pool is settled and no in-flight AQL packet
// still references it. Any IRQ signal used by an AccumulateCommand came
// from the runtime pool, not this one.
DestroyPersistentPool();

segmentBatches_.clear();
}

Expand Down Expand Up @@ -1105,17 +1127,22 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
std::unordered_map<int, SegmentBatch> segmentBatches_;

struct SyncPlan {
int num_segments = 0; // total segment count (used for bounds checks)
int num_hw_events = 0; // HW event slots to allocate (one per ncs=true segment)
int num_segments = 0; //!< total segment count (bounds check reference)
int num_hw_events = 0; //!< HW event slots allocated (one per leaf parallel segment)

// Dense index into segment_hw_events for each segment.
// seg_to_hw_event[seg_id] == -1 -> no completion signal emitted.
// seg_to_hw_event[seg_id] >= 0 -> index into the compact hw_events vector.
std::vector<int> seg_to_hw_event;
//! seg_to_hw_event[seg_id]: -1 = no completion signal; >= 0 = index into segment_hw_events.
std::vector<int> seg_to_hw_event;
//! Stable ProfilingSignal* per HW event slot; indexed via seg_to_hw_event.
std::vector<void*> segment_hw_events;

std::vector<amd::Device::HwEventPatch> patch_list;
std::vector<uint8_t*> barrier_packets;
std::vector<int> leaf_segment_ids;
std::vector<uint8_t*> barrier_packets;
std::vector<int> leaf_segment_ids;

//! Pointers to dep_signal[0].handle (AQL byte offset 8) in the flat buffer of
//! each non-stream-0 segment's first packet. Patched at every hipGraphLaunch
//! with the reset kernel's per-launch completion signal handle.
std::vector<uint64_t*> nonstream0_dep_signal_ptrs;

~SyncPlan() {
for (auto* p : barrier_packets) { delete[] p; }
Expand All @@ -1125,6 +1152,25 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
SyncPlan sync_plan_;

void BuildSyncPlan();

//! Cached signal counts for graph signal pool pre-allocation
size_t graph_signal_count_ = 0; //!< GPU-only signals (= num_hw_events after BuildSyncPlan)

//! Persistent signal pool owned by this GraphExec, allocated at instantiate time
//! in CaptureAndFormPacketsForGraph and destroyed in ~GraphExec.
//! AccumulateCommand holds only a non-owning reference.
amd::roc::GraphSignalPool* persistent_pool_ = nullptr;
amd::Device* persistent_pool_device_ = nullptr; //!< Device that owns persistent_pool_

//! Dedicated completion signal for the pre-baked reset kernel.
//! Non-null when graph has parallel (non-stream-0) segments; guards the
//! dep-signal barrier prepend in CaptureAndFormPacketsForGraph.
//! Freed in DestroyPersistentPool.
amd::roc::ProfilingSignal* reset_signal_ = nullptr;

//! Release persistent_pool_ and reset_signal_. Must be called only after
//! all in-flight launches have retired (no AQL packets still reference them).
void DestroyPersistentPool();
};

class ChildGraphNode : public GraphNode, public GraphExec {
Expand Down
54 changes: 54 additions & 0 deletions projects/clr/rocclr/device/blitcl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,39 @@ const char* HipExtraSourceCode = BLIT_KERNELS(
__ockl_dm_init_v1(heap_to_initialize, initial_blocks, heap_size, number_of_initial_blocks);
}

// Resets graph-pool amd_signal_t.value fields back to a given value
// (default 1) so the pool can be recycled for the next launch.
// `valueAddrs` is a flat array of raw 64-bit device addresses (one per
// signal); each work-item casts an entry to a __global ulong* and performs
// a plain 64-bit store. Each address is written by exactly one work-item
// (no intra-kernel race), and the AQL packet's scope_release at kernel
// completion makes the writes visible to host signal-wait hardware before
// the next iteration's AQL barrier acquires them.
__kernel void __amd_rocclr_resetGraphSignals(__global ulong* valueAddrs, uint count,
ulong resetValue) {
uint i = (uint)get_global_id(0);
if (i < count) {
__global ulong* p = (__global ulong*)(valueAddrs[i]);
*p = resetValue;
}
}

// Faster variant for pools that carry a persistent contiguous VA buffer
// (GraphSignalPool::ContBuffer()). The caller passes the stable device
// pointer populated once at pool-creation time; no per-launch host-to-
// device copy of the address array is needed. The kernel body is
// identical to __amd_rocclr_resetGraphSignals — the performance benefit
// comes entirely from eliminating the CPU-side CollectValuePtrs() scan
// and the memcpy into a transient kernarg slot on the dispatch path.
__kernel void __amd_rocclr_resetContSignalBuffer(__global ulong* valueAddrs, uint count,
ulong resetValue) {
uint i = (uint)get_global_id(0);
if (i < count) {
__global ulong* p = (__global ulong*)(valueAddrs[i]);
*p = resetValue;
}
}

__kernel void __amd_rocclr_gwsInit(uint value) { __builtin_amdgcn_ds_gws_init(value, 0); });

const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS(
Expand All @@ -206,6 +239,27 @@ const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS(
__kernel void __amd_rocclr_initHeap(ulong heap_to_initialize, ulong initial_blocks,
uint heap_size, uint number_of_initial_blocks) {
__ockl_dm_init_v1(heap_to_initialize, initial_blocks, heap_size, number_of_initial_blocks);
}

// See HipExtraSourceCode for the rationale; same kernel emitted in the
// no-GWS variant for parity.
__kernel void __amd_rocclr_resetGraphSignals(__global ulong* valueAddrs, uint count,
ulong resetValue) {
uint i = (uint)get_global_id(0);
if (i < count) {
__global ulong* p = (__global ulong*)(valueAddrs[i]);
*p = resetValue;
}
}

// See HipExtraSourceCode for the rationale; no-GWS parity copy.
__kernel void __amd_rocclr_resetContSignalBuffer(__global ulong* valueAddrs, uint count,
ulong resetValue) {
uint i = (uint)get_global_id(0);
if (i < count) {
__global ulong* p = (__global ulong*)(valueAddrs[i]);
*p = resetValue;
}
});

const char* BlitImageSourceCode = BLIT_KERNELS(
Expand Down
43 changes: 43 additions & 0 deletions projects/clr/rocclr/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include <shared_mutex>

namespace amd {
namespace roc { struct GraphSignalPool; }
class Command;
class CommandQueue;
class ReadMemoryCommand;
Expand Down Expand Up @@ -1354,6 +1355,8 @@ class VirtualDevice : public amd::ReferenceCountedObject {

//! Returns fence state of the VirtualGPU
virtual bool isFenceDirty() const = 0;
//! Insert a system scope on the next dispatch
virtual void addSystemScope() {}
//! Init hidden heap for device memory allocations
virtual void HiddenHeapInit() = 0;

Expand Down Expand Up @@ -2129,6 +2132,46 @@ class Device : public RuntimeObject {
virtual void ApplyHwEventPatches(const std::vector<HwEventPatch>& patches,
const std::vector<void*>& hw_events) const {}

// Creates a fully-initialised per-launch graph signal pool.
// Allocates gpu_count GPU-only signals, then acquires segment_count
// hw-event slots into hw_events and resets the last-acquired pointer so
// GetLastAcquired() only tracks dispatches. IRQ/handler-bearing signals
// are NOT owned by the graph pool — they come from the runtime pool.
// Returns nullptr (and leaves hw_events empty) on allocation failure.
virtual roc::GraphSignalPool* CreateGraphSignalPool(
size_t gpu_count,
size_t segment_count, std::vector<void*>& hw_events) const { return nullptr; }

// Returns the number of GPU-only signals consumed from the pool so far.
// Used by the graph executor to cache the count for the next launch.
virtual size_t GetGraphSignalPoolUsedCount(roc::GraphSignalPool* pool) const { return 0; }

// Destroys a graph signal pool created by CreateGraphSignalPool.
// Required because GraphSignalPool is only forward-declared in platform/
// headers; calling `delete pool;` from generic code would not invoke
// ~GraphSignalPool (delete-on-incomplete-type is undefined behaviour and
// in practice silently skips the destructor). Routing through this virtual
// ensures the destructor in rocdevice.cpp is actually called.
virtual void DestroyGraphSignalPool(roc::GraphSignalPool* pool) const {}

// Reset a previously-used graph signal pool so it can be reused for the
// next launch on `vdev`. When `prev_done` is true the caller has already
// verified (via the previous launch's leaf signals) that every GPU-only
// signal has settled, so we take the CPU fast path. Otherwise dispatches
// the __amd_rocclr_resetGraphSignals kernel on `vdev`'s queue with a
// completion signal attached; caller pulls it back via
// `vdev->Barriers().GetLastSignal()` and serializes parallel-stream
// dispatches behind it. `out_did_gpu_reset` (optional) is set to true iff
// the GPU kernel was actually dispatched (i.e. caller should read the
// last signal). Returns true on success.
virtual bool ResetGraphSignalPool(device::VirtualDevice* vdev,
roc::GraphSignalPool* pool,
bool prev_done,
bool* out_did_gpu_reset = nullptr) const {
if (out_did_gpu_reset != nullptr) *out_did_gpu_reset = false;
return false;
}

virtual const bool isFineGrainSupported() const {
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
}
Expand Down
Loading
Loading