Skip to content

Commit bbe90cb

Browse files
committed
Fixed the module 6 and 7
1 parent 357d727 commit bbe90cb

11 files changed

Lines changed: 100 additions & 125 deletions

modules/module3/examples/02_scan_prefix_sum_hip.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ int main() {
348348
HIP_CHECK(hipDeviceSynchronize());
349349
auto end = std::chrono::high_resolution_clock::now();
350350

351-
double hillis_time = std::chrono::duration<double, std::milli>(end - start).count();
351+
hillis_time = std::chrono::duration<double, std::milli>(end - start).count();
352352

353353
HIP_CHECK(hipMemcpy(h_output, d_output, bytes, hipMemcpyDeviceToHost));
354354

@@ -369,7 +369,7 @@ int main() {
369369
HIP_CHECK(hipDeviceSynchronize());
370370
end = std::chrono::high_resolution_clock::now();
371371

372-
double blelloch_time = std::chrono::duration<double, std::milli>(end - start).count();
372+
blelloch_time = std::chrono::duration<double, std::milli>(end - start).count();
373373

374374
HIP_CHECK(hipMemcpy(h_output, d_output, bytes, hipMemcpyDeviceToHost));
375375

modules/module3/examples/04_convolution_stencil_hip.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,6 @@ __global__ void separableConvCol(float *input, float *output, float *kernel,
271271
}
272272
}
273273

274-
}
275-
}
276-
277274
// 1D Convolution demonstration
278275

279276
void printImage(float *image, int width, int height, const char *name, int max_show = 8) {

modules/module3/examples/05_matrix_operations_hip.cpp

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,22 @@
11
#include <hip/hip_runtime.h>
2-
#include <rocblas/rocblas.h>
32
#include <stdio.h>
43
#include <stdlib.h>
54
#include <math.h>
65
#include <chrono>
76
#include "rocm7_utils.h"
87

8+
// Try to include ROCBlas if available
9+
#ifdef __has_include
10+
#if __has_include(<rocblas/rocblas.h>)
11+
#include <rocblas/rocblas.h>
12+
#define HAS_ROCBLAS 1
13+
#else
14+
#define HAS_ROCBLAS 0
15+
#endif
16+
#else
17+
#define HAS_ROCBLAS 0
18+
#endif
19+
920
#define TILE_SIZE 16
1021
#define BLOCK_SIZE 256
1122

@@ -271,22 +282,25 @@ __global__ void strassenMatrixMul(float *A, float *B, float *C, int N, int level
271282
}
272283
}
273284

274-
}
275-
}
276-
277285
// Matrix multiplication demonstration
278286

279287
class MatrixOperations {
280288
private:
289+
#if HAS_ROCBLAS
281290
rocblas_handle handle;
291+
#endif
282292

283293
public:
284294
MatrixOperations() {
295+
#if HAS_ROCBLAS
285296
rocblas_create_handle(&handle);
297+
#endif
286298
}
287299

288300
~MatrixOperations() {
301+
#if HAS_ROCBLAS
289302
rocblas_destroy_handle(handle);
303+
#endif
290304
}
291305

292306
void testMatrixMultiplication() {
@@ -299,7 +313,9 @@ class MatrixOperations {
299313
float *h_A = (float*)malloc(size);
300314
float *h_B = (float*)malloc(size);
301315
float *h_C_custom = (float*)malloc(size);
316+
#if HAS_ROCBLAS
302317
float *h_C_rocblas = (float*)malloc(size);
318+
#endif
303319

304320
// Initialize matrices
305321
for (int i = 0; i < N * N; i++) {
@@ -308,11 +324,14 @@ class MatrixOperations {
308324
}
309325

310326
// Allocate device memory
311-
float *d_A, *d_B, *d_C_custom, *d_C_rocblas;
327+
float *d_A, *d_B, *d_C_custom;
312328
HIP_CHECK(hipMalloc(&d_A, size));
313329
HIP_CHECK(hipMalloc(&d_B, size));
314330
HIP_CHECK(hipMalloc(&d_C_custom, size));
331+
#if HAS_ROCBLAS
332+
float *d_C_rocblas;
315333
HIP_CHECK(hipMalloc(&d_C_rocblas, size));
334+
#endif
316335

317336
// Copy data to device
318337
HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice));
@@ -327,8 +346,7 @@ class MatrixOperations {
327346
HIP_CHECK(hipEventCreate(&stop));
328347

329348
HIP_CHECK(hipEventRecord(start));
330-
hipLaunchKernelGGL(matrixMulTiled, gridSize, blockSize, 0, 0,
331-
d_A, d_B, d_C_custom, N);
349+
matrixMulTiled<<<gridSize, blockSize>>>(d_A, d_B, d_C_custom, N);
332350
HIP_CHECK(hipEventRecord(stop));
333351
HIP_CHECK(hipEventSynchronize(stop));
334352

@@ -340,14 +358,14 @@ class MatrixOperations {
340358
dim3 amdGridSize((N + 31) / 32, (N + 31) / 32);
341359

342360
HIP_CHECK(hipEventRecord(start));
343-
hipLaunchKernelGGL(matrixMulAMDOptimized, amdGridSize, amdBlockSize, 0, 0,
344-
d_A, d_B, d_C_custom, N);
361+
matrixMulAMDOptimized<<<amdGridSize, amdBlockSize>>>(d_A, d_B, d_C_custom, N);
345362
HIP_CHECK(hipEventRecord(stop));
346363
HIP_CHECK(hipEventSynchronize(stop));
347364

348365
float amd_time;
349366
HIP_CHECK(hipEventElapsedTime(&amd_time, start, stop));
350367

368+
#if HAS_ROCBLAS
351369
// Test rocBLAS implementation
352370
const float alpha = 1.0f, beta = 0.0f;
353371

@@ -359,6 +377,7 @@ class MatrixOperations {
359377

360378
float rocblas_time;
361379
HIP_CHECK(hipEventElapsedTime(&rocblas_time, start, stop));
380+
#endif
362381

363382
// Performance analysis
364383
double flops = 2.0 * N * N * N; // Multiply-add operations
@@ -368,11 +387,16 @@ class MatrixOperations {
368387
custom_time, flops / (custom_time * 1e6));
369388
printf("AMD optimized GEMM: %8.3f ms (%8.2f GFLOPS)\n",
370389
amd_time, flops / (amd_time * 1e6));
390+
#if HAS_ROCBLAS
371391
printf("rocBLAS GEMM: %8.3f ms (%8.2f GFLOPS)\n",
372392
rocblas_time, flops / (rocblas_time * 1e6));
393+
#else
394+
printf("rocBLAS GEMM: Not available (rocBLAS not found)\n");
395+
#endif
373396

374397
// Verify correctness
375398
HIP_CHECK(hipMemcpy(h_C_custom, d_C_custom, size, hipMemcpyDeviceToHost));
399+
#if HAS_ROCBLAS
376400
HIP_CHECK(hipMemcpy(h_C_rocblas, d_C_rocblas, size, hipMemcpyDeviceToHost));
377401

378402
double max_error = 0.0;
@@ -381,13 +405,20 @@ class MatrixOperations {
381405
max_error = fmax(max_error, error);
382406
}
383407
printf("Max error vs rocBLAS: %e\n", max_error);
408+
#else
409+
printf("Correctness verification: rocBLAS not available\n");
410+
#endif
384411

385412
// Cleanup
386413
HIP_CHECK(hipEventDestroy(start));
387414
HIP_CHECK(hipEventDestroy(stop));
388415

389-
free(h_A); free(h_B); free(h_C_custom); free(h_C_rocblas);
390-
HIP_CHECK(hipFree(d_A)); HIP_CHECK(hipFree(d_B)); HIP_CHECK(hipFree(d_C_custom)); HIP_CHECK(hipFree(d_C_rocblas));
416+
free(h_A); free(h_B); free(h_C_custom);
417+
HIP_CHECK(hipFree(d_A)); HIP_CHECK(hipFree(d_B)); HIP_CHECK(hipFree(d_C_custom));
418+
#if HAS_ROCBLAS
419+
free(h_C_rocblas);
420+
HIP_CHECK(hipFree(d_C_rocblas));
421+
#endif
391422
}
392423

393424
void testMatrixTranspose() {
@@ -421,8 +452,7 @@ class MatrixOperations {
421452

422453
// Standard transpose
423454
HIP_CHECK(hipEventRecord(start));
424-
hipLaunchKernelGGL(transposeSharedMem, gridSize, blockSize, 0, 0,
425-
d_input, d_output, width, height);
455+
transposeSharedMem<<<gridSize, blockSize>>>(d_input, d_output, width, height);
426456
HIP_CHECK(hipEventRecord(stop));
427457
HIP_CHECK(hipEventSynchronize(stop));
428458

@@ -434,8 +464,7 @@ class MatrixOperations {
434464
dim3 amdGridSize((width + 31) / 32, (height + 31) / 32);
435465

436466
HIP_CHECK(hipEventRecord(start));
437-
hipLaunchKernelGGL(transposeAMDOptimized, amdGridSize, amdBlockSize, 0, 0,
438-
d_input, d_output, width, height);
467+
transposeAMDOptimized<<<amdGridSize, amdBlockSize>>>(d_input, d_output, width, height);
439468
HIP_CHECK(hipEventRecord(stop));
440469
HIP_CHECK(hipEventSynchronize(stop));
441470

@@ -494,8 +523,7 @@ class MatrixOperations {
494523

495524
// Standard implementation
496525
HIP_CHECK(hipEventRecord(start));
497-
hipLaunchKernelGGL(matrixVectorMul, N, BLOCK_SIZE, 0, 0,
498-
d_matrix, d_vector, d_result, N);
526+
matrixVectorMul<<<N, BLOCK_SIZE>>>(d_matrix, d_vector, d_result, N);
499527
HIP_CHECK(hipEventRecord(stop));
500528
HIP_CHECK(hipEventSynchronize(stop));
501529

@@ -504,8 +532,7 @@ class MatrixOperations {
504532

505533
// Wavefront-optimized implementation
506534
HIP_CHECK(hipEventRecord(start));
507-
hipLaunchKernelGGL(matrixVectorMulWavefront, N, BLOCK_SIZE, 0, 0,
508-
d_matrix, d_vector, d_result, N);
535+
matrixVectorMulWavefront<<<N, BLOCK_SIZE>>>(d_matrix, d_vector, d_result, N);
509536
HIP_CHECK(hipEventRecord(stop));
510537
HIP_CHECK(hipEventSynchronize(stop));
511538

modules/module5/examples/02_memory_optimization_hip.cpp

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,7 @@
1313
#include <random>
1414
#include <algorithm>
1515
#include <iomanip>
16-
17-
#define HIP_CHECK(call) \
18-
do { \
19-
hipError_t error = call; \
20-
if (error != hipSuccess) { \
21-
std::cerr << "HIP error at " << __FILE__ << ":" << __LINE__ << " - " << hipGetErrorString(error) << std::endl; \
22-
exit(1); \
23-
} \
24-
} while(0)
16+
#include "rocm7_utils.h"
2517

2618
constexpr int WAVEFRONT_SIZE = 64;
2719

@@ -201,8 +193,8 @@ void test_matrix_transpose() {
201193

202194
std::cout << "Correctness: " << (correct ? "PASS" : "FAIL") << "\n";
203195

204-
hipFree(d_input);
205-
hipFree(d_output);
196+
HIP_CHECK(hipFree(d_input));
197+
HIP_CHECK(hipFree(d_output));
206198
}
207199

208200
void test_memory_bandwidth() {
@@ -230,18 +222,18 @@ void test_memory_bandwidth() {
230222
<< std::fixed << std::setprecision(3) << kernel_time << " ms"
231223
<< " (Bandwidth: " << std::setprecision(1) << bandwidth << " GB/s)\n";
232224

233-
hipFree(d_input);
234-
hipFree(d_output);
225+
HIP_CHECK(hipFree(d_input));
226+
HIP_CHECK(hipFree(d_output));
235227
}
236228

237229
int main() {
238230
std::cout << "HIP Memory Optimization Techniques\n";
239231
std::cout << "==================================\n";
240232

241233
int device;
242-
hipGetDevice(&device);
234+
HIP_CHECK(hipGetDevice(&device));
243235
hipDeviceProp_t props;
244-
hipGetDeviceProperties(&props, device);
236+
HIP_CHECK(hipGetDeviceProperties(&props, device));
245237

246238
std::cout << "GPU: " << props.name << "\n";
247239
std::cout << "Memory: " << props.totalGlobalMem / (1024*1024) << " MB\n";

modules/module6/examples/01_convolution_hip.cpp

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,6 @@
1919
#include <vector>
2020
#include <cassert>
2121

22-
// Error checking macro
23-
#define HIP_CHECK(call) \
24-
do { \
25-
hipError_t err = call; \
26-
if (err != hipSuccess) { \
27-
fprintf(stderr, "HIP error at %s:%d - %s\n", __FILE__, __LINE__, \
28-
hipGetErrorString(err)); \
29-
exit(EXIT_FAILURE); \
30-
} \
31-
} while(0)
32-
3322
// Constants
3423
const int BLOCK_SIZE = 16;
3524
const int TILE_SIZE = 16;
@@ -509,9 +498,9 @@ void benchmark_conv1d() {
509498
delete[] h_output_naive;
510499
delete[] h_output_shared;
511500
delete[] h_output_cpu;
512-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
513-
HIP_CHECK(HIP_CHECK(hipFree(d_kernel));
514-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
501+
HIP_CHECK(hipFree(d_input));
502+
HIP_CHECK(hipFree(d_kernel));
503+
HIP_CHECK(hipFree(d_output));
515504
}
516505

517506
/**
@@ -615,9 +604,9 @@ void benchmark_conv2d() {
615604
delete[] h_output_naive;
616605
delete[] h_output_shared;
617606
delete[] h_output_cpu;
618-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
619-
HIP_CHECK(HIP_CHECK(hipFree(d_kernel));
620-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
607+
HIP_CHECK(hipFree(d_input));
608+
HIP_CHECK(hipFree(d_kernel));
609+
HIP_CHECK(hipFree(d_output));
621610
}
622611

623612
/**
@@ -710,10 +699,10 @@ void benchmark_separable_conv() {
710699
delete[] h_output_separable;
711700
delete[] h_output_cpu;
712701
delete[] h_temp;
713-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
714-
HIP_CHECK(HIP_CHECK(hipFree(d_kernel));
715-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
716-
HIP_CHECK(HIP_CHECK(hipFree(d_temp));
702+
HIP_CHECK(hipFree(d_input));
703+
HIP_CHECK(hipFree(d_kernel));
704+
HIP_CHECK(hipFree(d_output));
705+
HIP_CHECK(hipFree(d_temp));
717706
}
718707

719708
/**

modules/module6/examples/02_stencil_hip.cpp

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,6 @@
1919
#include <vector>
2020
#include <cassert>
2121

22-
// Error checking macro
23-
#define HIP_CHECK(call) \
24-
do { \
25-
hipError_t err = call; \
26-
if (err != hipSuccess) { \
27-
fprintf(stderr, "HIP error at %s:%d - %s\n", __FILE__, __LINE__, \
28-
hipGetErrorString(err)); \
29-
exit(EXIT_FAILURE); \
30-
} \
31-
} while(0)
32-
3322
// Constants
3423
const int BLOCK_SIZE = 16;
3524
const int RADIUS = 3;
@@ -581,8 +570,8 @@ void benchmark_stencil_1d() {
581570
delete[] h_output_shared;
582571
delete[] h_output_coarsened;
583572
delete[] h_output_cpu;
584-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
585-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
573+
HIP_CHECK(hipFree(d_input));
574+
HIP_CHECK(hipFree(d_output));
586575
}
587576

588577
/**
@@ -684,8 +673,8 @@ void benchmark_stencil_2d() {
684673
delete[] h_output_shared;
685674
delete[] h_output_register;
686675
delete[] h_output_cpu;
687-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
688-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
676+
HIP_CHECK(hipFree(d_input));
677+
HIP_CHECK(hipFree(d_output));
689678
}
690679

691680
/**
@@ -741,8 +730,8 @@ void benchmark_stencil_3d() {
741730
// Cleanup
742731
delete[] h_input;
743732
delete[] h_output_naive;
744-
HIP_CHECK(HIP_CHECK(hipFree(d_input));
745-
HIP_CHECK(HIP_CHECK(hipFree(d_output));
733+
HIP_CHECK(hipFree(d_input));
734+
HIP_CHECK(hipFree(d_output));
746735
}
747736

748737
/**

0 commit comments

Comments
 (0)