@@ -30,8 +30,7 @@ struct BenchmarkResult {
3030 void print () const {
3131 printf (" %-30s | %4d x %4d x %4d | %8.3f ms | %8.2f GFLOPS | %s | err: "
3232 " %.2e\n " ,
33- kernel_name.c_str (), M, K, N, time_ms, gflops,
34- correct ? " PASS" : " FAIL" , max_error);
33+ kernel_name.c_str (), M, K, N, time_ms, gflops, correct ? " PASS" : " FAIL" , max_error);
3534 }
3635};
3736
@@ -54,9 +53,8 @@ public:
5453 }
5554
5655 template <typename KernelFunc>
57- BenchmarkResult run (const std::string &name, KernelFunc kernel_func, int M,
58- int K, int N, int warmup_runs = 5 ,
59- int benchmark_runs = 20 ,
56+ BenchmarkResult run (const std::string &name, KernelFunc kernel_func, int M, int K, int N,
57+ int warmup_runs = 5 , int benchmark_runs = 20 ,
6058 VerifyTolerance tolerance = kStandardVerifyTolerance ) {
6159 BenchmarkResult result;
6260 result.kernel_name = name;
@@ -77,9 +75,8 @@ public:
7775 d_B.copyFromHost (h_B.data (), K * N);
7876
7977 float alpha = 1 .0f , beta = 0 .0f ;
80- CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K,
81- &alpha, d_B.get (), N, d_A.get (), K, &beta,
82- d_C_ref.get (), N));
78+ CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K, &alpha, d_B.get (),
79+ N, d_A.get (), K, &beta, d_C_ref.get (), N));
8380
8481 for (int i = 0 ; i < warmup_runs; ++i) {
8582 d_C.zero ();
@@ -101,17 +98,15 @@ public:
10198 d_C.copyToHost (h_C.data (), M * N);
10299 d_C_ref.copyToHost (h_C_ref.data (), M * N);
103100
104- VerifyResult verify_result = compareMatrices (h_C.data (), h_C_ref.data (), M,
105- N, tolerance);
101+ VerifyResult verify_result = compareMatrices (h_C.data (), h_C_ref.data (), M, N, tolerance);
106102 result.correct = verify_result.passed ;
107103 result.max_error = verify_result.max_rel_error ;
108104
109105 results_.push_back (result);
110106 return result;
111107 }
112108
113- BenchmarkResult runCublas (int M, int K, int N, int warmup_runs = 5 ,
114- int benchmark_runs = 20 ) {
109+ BenchmarkResult runCublas (int M, int K, int N, int warmup_runs = 5 , int benchmark_runs = 20 ) {
115110 BenchmarkResult result;
116111 result.kernel_name = " cuBLAS" ;
117112 result.M = M;
@@ -132,17 +127,15 @@ public:
132127 float alpha = 1 .0f , beta = 0 .0f ;
133128
134129 for (int i = 0 ; i < warmup_runs; ++i) {
135- CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M,
136- K, &alpha, d_B.get (), N, d_A.get (), K, &beta,
137- d_C.get (), N));
130+ CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K, &alpha, d_B.get (),
131+ N, d_A.get (), K, &beta, d_C.get (), N));
138132 }
139133 CUDA_CHECK (cudaDeviceSynchronize ());
140134
141135 CUDA_CHECK (cudaEventRecord (start_));
142136 for (int i = 0 ; i < benchmark_runs; ++i) {
143- CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M,
144- K, &alpha, d_B.get (), N, d_A.get (), K, &beta,
145- d_C.get (), N));
137+ CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K, &alpha, d_B.get (),
138+ N, d_A.get (), K, &beta, d_C.get (), N));
146139 }
147140 CUDA_CHECK (cudaEventRecord (stop_));
148141 CUDA_CHECK (cudaEventSynchronize (stop_));
@@ -157,9 +150,9 @@ public:
157150 return result;
158151 }
159152
160- BenchmarkResult runTensorCoreComputeOnly (
161- int M, int K, int N, int warmup_runs = 5 , int benchmark_runs = 20 ,
162- VerifyTolerance tolerance = kTensorCoreVerifyTolerance ) {
153+ BenchmarkResult runTensorCoreComputeOnly (int M, int K, int N, int warmup_runs = 5 ,
154+ int benchmark_runs = 20 ,
155+ VerifyTolerance tolerance = kTensorCoreVerifyTolerance ) {
163156 if (!tensorCoresAvailable () || !tensorCoreDimensionsSupported (M, K, N)) {
164157 throw CudaError (" Tensor Core compute-only benchmark requires sm_70+ and "
165158 " dimensions aligned to 16" );
@@ -186,32 +179,27 @@ public:
186179 d_B.copyFromHost (h_B.data (), K * N);
187180
188181 float alpha = 1 .0f , beta = 0 .0f ;
189- CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K,
190- &alpha, d_B.get (), N, d_A.get (), K, &beta,
191- d_C_ref.get (), N));
182+ CUBLAS_CHECK (cublasSgemm (cublas_handle_, CUBLAS_OP_N , CUBLAS_OP_N , N, M, K, &alpha, d_B.get (),
183+ N, d_A.get (), K, &beta, d_C_ref.get (), N));
192184
193185 int blockSize = 256 ;
194186 int gridSizeA = (M * K + blockSize - 1 ) / blockSize;
195187 int gridSizeB = (K * N + blockSize - 1 ) / blockSize;
196188
197- float_to_half_kernel<<<gridSizeA, blockSize>>> (d_A.get (), d_A_fp16.get (),
198- M * K);
199- float_to_half_kernel<<<gridSizeB, blockSize>>> (d_B.get (), d_B_fp16.get (),
200- K * N);
189+ float_to_half_kernel<<<gridSizeA, blockSize>>> (d_A.get (), d_A_fp16.get (), M * K);
190+ float_to_half_kernel<<<gridSizeB, blockSize>>> (d_B.get (), d_B_fp16.get (), K * N);
201191 CUDA_CHECK (cudaGetLastError ());
202192 CUDA_CHECK (cudaDeviceSynchronize ());
203193
204194 for (int i = 0 ; i < warmup_runs; ++i) {
205195 d_C.zero ();
206- launch_tensor_core_sgemm_fp16 (d_A_fp16.get (), d_B_fp16.get (), d_C.get (), M,
207- K, N);
196+ launch_tensor_core_sgemm_fp16 (d_A_fp16.get (), d_B_fp16.get (), d_C.get (), M, K, N);
208197 }
209198 CUDA_CHECK (cudaDeviceSynchronize ());
210199
211200 CUDA_CHECK (cudaEventRecord (start_));
212201 for (int i = 0 ; i < benchmark_runs; ++i) {
213- launch_tensor_core_sgemm_fp16 (d_A_fp16.get (), d_B_fp16.get (), d_C.get (), M,
214- K, N);
202+ launch_tensor_core_sgemm_fp16 (d_A_fp16.get (), d_B_fp16.get (), d_C.get (), M, K, N);
215203 }
216204 CUDA_CHECK (cudaEventRecord (stop_));
217205 CUDA_CHECK (cudaEventSynchronize (stop_));
@@ -223,8 +211,7 @@ public:
223211 d_C.copyToHost (h_C.data (), M * N);
224212 d_C_ref.copyToHost (h_C_ref.data (), M * N);
225213
226- VerifyResult verify_result = compareMatrices (h_C.data (), h_C_ref.data (), M,
227- N, tolerance);
214+ VerifyResult verify_result = compareMatrices (h_C.data (), h_C_ref.data (), M, N, tolerance);
228215 result.correct = verify_result.passed ;
229216 result.max_error = verify_result.max_rel_error ;
230217
@@ -239,8 +226,8 @@ public:
239226 printf (" SGEMM Benchmark Results\n " );
240227 printf (" ===================================================================="
241228 " ============\n " );
242- printf (" %-30s | %-17s | %10s | %14s | %4s | %s\n " , " Kernel" ,
243- " Dimensions " , " Time " , " Performance" , " Pass" , " Max Error" );
229+ printf (" %-30s | %-17s | %10s | %14s | %4s | %s\n " , " Kernel" , " Dimensions " , " Time " ,
230+ " Performance" , " Pass" , " Max Error" );
244231 printf (" --------------------------------------------------------------------"
245232 " ------------\n " );
246233
@@ -264,13 +251,12 @@ public:
264251 for (const auto &result : results_) {
265252 double flops = 2.0 * result.M * result.N * result.K ;
266253 double bytes =
267- (result.M * result.K + result.K * result.N + result.M * result.N ) *
268- sizeof (float );
254+ (result.M * result.K + result.K * result.N + result.M * result.N ) * sizeof (float );
269255 double ai = flops / bytes;
270256
271- file << result.kernel_name << " ," << result.M << " ," << result.K << " ,"
272- << result.N << " ," << result.time_ms << " ," << result.gflops << " ,"
273- << result. bandwidth_gb_s << " , " << ai << " \n " ;
257+ file << result.kernel_name << " ," << result.M << " ," << result.K << " ," << result. N << " , "
258+ << result.time_ms << " ," << result.gflops << " ," << result.bandwidth_gb_s << " ," << ai
259+ << " \n " ;
274260 }
275261
276262 file.close ();
@@ -289,9 +275,8 @@ private:
289275 double flops = 2.0 * result.M * result.N * result.K ;
290276 result.gflops = (flops / (result.time_ms * 1e-3 )) / 1e9 ;
291277
292- double bytes = (result.M * result.K + result.K * result.N +
293- result.M * result.N ) *
294- sizeof (float );
278+ double bytes =
279+ (result.M * result.K + result.K * result.N + result.M * result.N ) * sizeof (float );
295280 result.bandwidth_gb_s = (bytes / (result.time_ms * 1e-3 )) / 1e9 ;
296281 }
297282
@@ -304,9 +289,8 @@ private:
304289// Utility Functions
305290// ============================================================================
306291
307- inline void
308- printPerformanceComparison (const std::vector<BenchmarkResult> &results,
309- float cublas_gflops) {
292+ inline void printPerformanceComparison (const std::vector<BenchmarkResult> &results,
293+ float cublas_gflops) {
310294 printf (" \n " );
311295 printf (" Performance Comparison (vs cuBLAS):\n " );
312296 printf (" ---------------------------------------------------------------------"
@@ -317,8 +301,8 @@ printPerformanceComparison(const std::vector<BenchmarkResult> &results,
317301
318302 for (const auto &result : results) {
319303 float percentage = (result.gflops / cublas_gflops) * 100 .0f ;
320- printf (" %-30s | %10.2f | %8.1f%%\n " , result.kernel_name .c_str (),
321- result. gflops , percentage);
304+ printf (" %-30s | %10.2f | %8.1f%%\n " , result.kernel_name .c_str (), result. gflops ,
305+ percentage);
322306 }
323307 printf (" ---------------------------------------------------------------------"
324308 " -----------\n " );
@@ -348,8 +332,7 @@ inline float getTheoreticalPeakGflops() {
348332 float clockGHz = static_cast <float >(prop.clockRate ) / 1e6f;
349333
350334 // Peak GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) * 1000 (MHz factor)
351- float peakGflops =
352- prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000 ;
335+ float peakGflops = prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000 ;
353336
354337 return peakGflops;
355338}
0 commit comments