@@ -35,13 +35,26 @@ static bool rocm_available() {
3535 return available == 1 ;
3636}
3737
38- // Check if managed memory is supported on this device
38+ // Check if managed memory (HMM) is supported on this device.
39+ // On integrated GPUs (e.g. Strix Halo) CPU and GPU share system RAM, so HMM is
40+ // cheap there; managed memory is the fallback when hipExtMallocWithFlags fails.
3941static bool managed_memory_supported () {
40- // Always return false to force the use of hipHostMalloc (GTT RAM).
41- // hipMallocManaged uses HMM, which causes implicit page migrations and
42- // significant memory copying between host and device on access.
43- // Using hipHostMalloc maps pinned host memory directly to the GPU's address space.
44- return false ;
42+ static int supported = -1 ;
43+ if (supported < 0 ) {
44+ if (!rocm_available ()) {
45+ supported = 0 ;
46+ } else {
47+ void * test_ptr = nullptr ;
48+ hipError_t err = hipMallocManaged (&test_ptr, 64 );
49+ if (err == hipSuccess) {
50+ (void )hipFree (test_ptr);
51+ supported = 1 ;
52+ } else {
53+ supported = 0 ;
54+ }
55+ }
56+ }
57+ return supported == 1 ;
4558}
4659
4760static bool is_integrated () {
@@ -64,18 +77,18 @@ inline void* rocm_unified_malloc(size_t size, bool& is_managed) {
6477 void * data = nullptr ;
6578 hipError_t err;
6679 if (is_integrated ()) {
80+ // Unified memory device (iGPU/APU): CPU and GPU share system RAM.
81+ // Try hipExtMallocWithFlags first (fine-grained coherent, best GPU
82+ // bandwidth). Falls back to hipMallocManaged for large allocations
83+ // that exceed the small device-local VRAM (~2GB).
6784 err = hipExtMallocWithFlags (&data, size, hipDeviceMallocFinegrained);
68- is_managed = true ; // Use is_managed=true to signify hipFree should be used
85+ if (err != hipSuccess) {
86+ err = hipMallocManaged (&data, size);
87+ }
88+ is_managed = true ;
6989 } else if (managed_memory_supported ()) {
7090 err = hipMallocManaged (&data, size);
7191 is_managed = true ;
72- if (err == hipSuccess) {
73- int device_count = 0 ;
74- (void )hipGetDeviceCount (&device_count);
75- for (int i = 0 ; i < device_count; ++i) {
76- (void )hipMemAdvise (data, size, hipMemAdviseSetAccessedBy, i);
77- }
78- }
7992 } else {
8093 err = hipHostMalloc (&data, size, hipHostMallocDefault);
8194 is_managed = false ;
@@ -193,6 +206,14 @@ Buffer RocmAllocator::malloc(size_t size) {
193206 }
194207
195208 // Find available buffer from cache.
209+ // Use aggressive size rounding to maximize cache hit rate:
210+ // - Small (<=8B): scalar pool
211+ // - Medium (<16KB): power-of-2
212+ // - Large (<1MB): 16KB page aligned
213+ // - Very large (>=1MB): power-of-2 (coarser buckets = more cache hits)
214+ // The power-of-2 rounding for very large allocations is critical for decode —
215+ // without it, slightly different sizes (e.g., 1.01MB vs 1.02MB) miss the
216+ // cache and trigger hipExtMallocWithFlags at ~7ms each.
196217 auto orig_size = size;
197218 std::unique_lock lock (mutex_);
198219 if (size <= small_block_size) {
@@ -219,14 +240,11 @@ Buffer RocmAllocator::malloc(size_t size) {
219240 lock.unlock ();
220241 if (!buf) {
221242 if (is_integrated ()) {
222- buf = new RocmBuffer{nullptr , size, false , -1 };
223- hipError_t err = hipExtMallocWithFlags (&buf->data , size, hipDeviceMallocFinegrained);
224- if (err != hipSuccess) {
225- delete buf;
226- std::ostringstream oss;
227- oss << " hipExtMallocWithFlags failed: " << hipGetErrorString (err) << " ." ;
228- throw std::runtime_error (oss.str ());
229- }
243+ // Integrated GPU: allocate unified memory (CPU+GPU accessible).
244+ // device=-1 signals unified memory — no move_to_unified_memory needed.
245+ bool is_managed = false ;
246+ void * data = rocm_unified_malloc (size, is_managed);
247+ buf = new RocmBuffer{data, size, is_managed, -1 };
230248 } else {
231249 int device = 0 ;
232250 hipGetDevice (&device);
@@ -373,12 +391,18 @@ void* Buffer::raw_ptr() {
373391 if (!ptr_) {
374392 return nullptr ;
375393 }
376- // Synchronize all streams before accessing memory from CPU
377- // This ensures all GPU operations have completed
378- (void )hipDeviceSynchronize ();
379-
380394 auto & cbuf = *static_cast <rocm::RocmBuffer*>(ptr_);
381- rocm::allocator ().move_to_unified_memory (cbuf);
395+
396+ if (cbuf.device == -1 ) {
397+ // Unified memory (integrated GPU or hipMallocManaged): CPU-accessible.
398+ // hipStreamSynchronize(nullptr) syncs via the default (null) stream — lighter than
399+ // hipDeviceSynchronize. NOTE(review): non-blocking streams may not be awaited — confirm.
400+ (void )hipStreamSynchronize (nullptr );
401+ } else {
402+ // Discrete GPU VRAM: full sync + migrate to host-accessible memory.
403+ (void )hipDeviceSynchronize ();
404+ rocm::allocator ().move_to_unified_memory (cbuf);
405+ }
382406 return cbuf.data ;
383407}
384408
0 commit comments