11#include < hip/hip_runtime.h>
2- #include < rocblas/rocblas.h>
32#include < stdio.h>
43#include < stdlib.h>
54#include < math.h>
65#include < chrono>
76#include " rocm7_utils.h"
87
// Optional rocBLAS support: include the header only when the compiler can
// locate it, and record the outcome in HAS_ROCBLAS (1 = found, 0 = not found).
// Toolchains without __has_include are treated as not having rocBLAS.
#if defined(__has_include)
#  if __has_include(<rocblas/rocblas.h>)
#    include <rocblas/rocblas.h>
#    define HAS_ROCBLAS 1
#  else
#    define HAS_ROCBLAS 0
#  endif
#else
#  define HAS_ROCBLAS 0
#endif
19+
// Tile edge length; presumably the tile dimension used by the tiled kernels,
// which are not visible in this view — TODO confirm.
921#define TILE_SIZE 16
// Passed as the block-size launch argument for the matrix-vector kernels.
1021#define BLOCK_SIZE 256
1122
@@ -271,22 +282,25 @@ __global__ void strassenMatrixMul(float *A, float *B, float *C, int N, int level
271282 }
272283}
273284
274- }
275- }
276-
277285// Matrix multiplication demonstration
278286
279287class MatrixOperations {
280288private:
289+ #if HAS_ROCBLAS
281290 rocblas_handle handle;
291+ #endif
282292
283293public:
284294 MatrixOperations () {
295+ #if HAS_ROCBLAS
285296 rocblas_create_handle (&handle);
297+ #endif
286298 }
287299
288300 ~MatrixOperations () {
301+ #if HAS_ROCBLAS
289302 rocblas_destroy_handle (handle);
303+ #endif
290304 }
291305
292306 void testMatrixMultiplication () {
@@ -299,7 +313,9 @@ class MatrixOperations {
299313 float *h_A = (float *)malloc (size);
300314 float *h_B = (float *)malloc (size);
301315 float *h_C_custom = (float *)malloc (size);
316+ #if HAS_ROCBLAS
302317 float *h_C_rocblas = (float *)malloc (size);
318+ #endif
303319
304320 // Initialize matrices
305321 for (int i = 0 ; i < N * N; i++) {
@@ -308,11 +324,14 @@ class MatrixOperations {
308324 }
309325
310326 // Allocate device memory
311- float *d_A, *d_B, *d_C_custom, *d_C_rocblas ;
327+ float *d_A, *d_B, *d_C_custom;
312328 HIP_CHECK (hipMalloc (&d_A, size));
313329 HIP_CHECK (hipMalloc (&d_B, size));
314330 HIP_CHECK (hipMalloc (&d_C_custom, size));
331+ #if HAS_ROCBLAS
332+ float *d_C_rocblas;
315333 HIP_CHECK (hipMalloc (&d_C_rocblas, size));
334+ #endif
316335
317336 // Copy data to device
318337 HIP_CHECK (hipMemcpy (d_A, h_A, size, hipMemcpyHostToDevice));
@@ -327,8 +346,7 @@ class MatrixOperations {
327346 HIP_CHECK (hipEventCreate (&stop));
328347
329348 HIP_CHECK (hipEventRecord (start));
330- hipLaunchKernelGGL (matrixMulTiled, gridSize, blockSize, 0 , 0 ,
331- d_A, d_B, d_C_custom, N);
349+ matrixMulTiled<<<gridSize, blockSize>>>(d_A, d_B, d_C_custom, N);
332350 HIP_CHECK (hipEventRecord (stop));
333351 HIP_CHECK (hipEventSynchronize (stop));
334352
@@ -340,14 +358,14 @@ class MatrixOperations {
340358 dim3 amdGridSize ((N + 31 ) / 32 , (N + 31 ) / 32 );
341359
342360 HIP_CHECK (hipEventRecord (start));
343- hipLaunchKernelGGL (matrixMulAMDOptimized, amdGridSize, amdBlockSize, 0 , 0 ,
344- d_A, d_B, d_C_custom, N);
361+ matrixMulAMDOptimized<<<amdGridSize, amdBlockSize>>>(d_A, d_B, d_C_custom, N);
345362 HIP_CHECK (hipEventRecord (stop));
346363 HIP_CHECK (hipEventSynchronize (stop));
347364
348365 float amd_time;
349366 HIP_CHECK (hipEventElapsedTime (&amd_time, start, stop));
350367
368+ #if HAS_ROCBLAS
351369 // Test rocBLAS implementation
352370 const float alpha = 1 .0f , beta = 0 .0f ;
353371
@@ -359,6 +377,7 @@ class MatrixOperations {
359377
360378 float rocblas_time;
361379 HIP_CHECK (hipEventElapsedTime (&rocblas_time, start, stop));
380+ #endif
362381
363382 // Performance analysis
364383 double flops = 2.0 * N * N * N; // Multiply-add operations
@@ -368,11 +387,16 @@ class MatrixOperations {
368387 custom_time, flops / (custom_time * 1e6 ));
369388 printf (" AMD optimized GEMM: %8.3f ms (%8.2f GFLOPS)\n " ,
370389 amd_time, flops / (amd_time * 1e6 ));
390+ #if HAS_ROCBLAS
371391 printf (" rocBLAS GEMM: %8.3f ms (%8.2f GFLOPS)\n " ,
372392 rocblas_time, flops / (rocblas_time * 1e6 ));
393+ #else
394+ printf (" rocBLAS GEMM: Not available (rocBLAS not found)\n " );
395+ #endif
373396
374397 // Verify correctness
375398 HIP_CHECK (hipMemcpy (h_C_custom, d_C_custom, size, hipMemcpyDeviceToHost));
399+ #if HAS_ROCBLAS
376400 HIP_CHECK (hipMemcpy (h_C_rocblas, d_C_rocblas, size, hipMemcpyDeviceToHost));
377401
378402 double max_error = 0.0 ;
@@ -381,13 +405,20 @@ class MatrixOperations {
381405 max_error = fmax (max_error, error);
382406 }
383407 printf (" Max error vs rocBLAS: %e\n " , max_error);
408+ #else
409+ printf (" Correctness verification: rocBLAS not available\n " );
410+ #endif
384411
385412 // Cleanup
386413 HIP_CHECK (hipEventDestroy (start));
387414 HIP_CHECK (hipEventDestroy (stop));
388415
389- free (h_A); free (h_B); free (h_C_custom); free (h_C_rocblas);
390- HIP_CHECK (hipFree (d_A)); HIP_CHECK (hipFree (d_B)); HIP_CHECK (hipFree (d_C_custom)); HIP_CHECK (hipFree (d_C_rocblas));
416+ free (h_A); free (h_B); free (h_C_custom);
417+ HIP_CHECK (hipFree (d_A)); HIP_CHECK (hipFree (d_B)); HIP_CHECK (hipFree (d_C_custom));
418+ #if HAS_ROCBLAS
419+ free (h_C_rocblas);
420+ HIP_CHECK (hipFree (d_C_rocblas));
421+ #endif
391422 }
392423
393424 void testMatrixTranspose () {
@@ -421,8 +452,7 @@ class MatrixOperations {
421452
422453 // Standard transpose
423454 HIP_CHECK (hipEventRecord (start));
424- hipLaunchKernelGGL (transposeSharedMem, gridSize, blockSize, 0 , 0 ,
425- d_input, d_output, width, height);
455+ transposeSharedMem<<<gridSize, blockSize>>>(d_input, d_output, width, height);
426456 HIP_CHECK (hipEventRecord (stop));
427457 HIP_CHECK (hipEventSynchronize (stop));
428458
@@ -434,8 +464,7 @@ class MatrixOperations {
434464 dim3 amdGridSize ((width + 31 ) / 32 , (height + 31 ) / 32 );
435465
436466 HIP_CHECK (hipEventRecord (start));
437- hipLaunchKernelGGL (transposeAMDOptimized, amdGridSize, amdBlockSize, 0 , 0 ,
438- d_input, d_output, width, height);
467+ transposeAMDOptimized<<<amdGridSize, amdBlockSize>>>(d_input, d_output, width, height);
439468 HIP_CHECK (hipEventRecord (stop));
440469 HIP_CHECK (hipEventSynchronize (stop));
441470
@@ -494,8 +523,7 @@ class MatrixOperations {
494523
495524 // Standard implementation
496525 HIP_CHECK (hipEventRecord (start));
497- hipLaunchKernelGGL (matrixVectorMul, N, BLOCK_SIZE , 0 , 0 ,
498- d_matrix, d_vector, d_result, N);
526+ matrixVectorMul<<<N, BLOCK_SIZE >>>(d_matrix, d_vector, d_result, N);
499527 HIP_CHECK (hipEventRecord (stop));
500528 HIP_CHECK (hipEventSynchronize (stop));
501529
@@ -504,8 +532,7 @@ class MatrixOperations {
504532
505533 // Wavefront-optimized implementation
506534 HIP_CHECK (hipEventRecord (start));
507- hipLaunchKernelGGL (matrixVectorMulWavefront, N, BLOCK_SIZE , 0 , 0 ,
508- d_matrix, d_vector, d_result, N);
535+ matrixVectorMulWavefront<<<N, BLOCK_SIZE >>>(d_matrix, d_vector, d_result, N);
509536 HIP_CHECK (hipEventRecord (stop));
510537 HIP_CHECK (hipEventSynchronize (stop));
511538
0 commit comments