Skip to content

Commit 5fe6ec0

Browse files
committed
refactor: improve code safety and quality
- Fix DeviceMemory constructor exception safety (initialize ptr_ to nullptr)
- Add gridSize overflow check for large matrix support
- Extract magic numbers into named constants (kDefaultTileSize, kDefaultBlockSize, kFilenameBufferSize)
- Add DeviceInfoCache singleton to cache device properties and reduce redundant queries
- Add [[maybe_unused]] attribute to utility functions (nullFallback, calculateBandwidthUtilization, printPerformanceReport)
1 parent 8fb0a60 commit 5fe6ec0

5 files changed

Lines changed: 124 additions & 55 deletions

File tree

src/benchmark_runner.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ class BenchmarkRunner {
103103
printPerformanceComparison(benchmark.getResults(), cublas_gflops);
104104

105105
// 导出 roofline 数据
106-
char filename[256];
106+
char filename[kFilenameBufferSize];
107107
snprintf(filename, sizeof(filename), "roofline_data_%d_%d_%d.csv", M, K, N);
108108
benchmark.exportRooflineData(filename);
109109
}

src/kernels/tensor_core_benchmark.cuh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "../utils/benchmark_metrics.cuh"
66
#include "../utils/verify.cuh"
77

8+
#include <climits>
89
#include <cublas_v2.h>
910
#include <cuda_fp16.h>
1011
#include <cuda_runtime.h>
@@ -66,9 +67,17 @@ runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, in
6667
CUBLAS_CHECK(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
6768
d_B.get(), N, d_A.get(), K, &beta, d_C_ref.get(), N));
6869

69-
int blockSize = 256;
70-
int gridSizeA = (M * K + blockSize - 1) / blockSize;
71-
int gridSizeB = (K * N + blockSize - 1) / blockSize;
70+
int blockSize = kDefaultBlockSize;
71+
// 安全计算 gridSize,检查溢出
72+
// Computes ceil(num / blk) as an `int` grid dimension, throwing CudaError when the
// launch cannot be expressed safely.
//
// The element count itself is validated as well as the grid: the conversion launches
// below pass the count to the kernel as an `int` (`M * K`, `K * N`), so any count above
// INT_MAX would overflow there even though the resulting grid (count / blockSize) still
// fits in an int. Checking only the grid would let those sizes through.
auto safeGridSize = [](size_t num, int blk) -> int {
    const size_t intMax = static_cast<size_t>(INT_MAX);
    const size_t threadsPerBlock = static_cast<size_t>(blk);
    const size_t grid = (num + threadsPerBlock - 1) / threadsPerBlock;
    if (num > intMax || grid > intMax) {
        throw CudaError("Grid size overflow: matrix too large for kernel launch");
    }
    return static_cast<int>(grid);
};
79+
int gridSizeA = safeGridSize(static_cast<size_t>(M) * K, blockSize);
80+
int gridSizeB = safeGridSize(static_cast<size_t>(K) * N, blockSize);
7281

7382
float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(), M * K);
7483
float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(), K * N);

src/kernels/tensor_core_sgemm.cuh

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include "../utils/cuda_utils.cuh"
2323
#include "../utils/verify.cuh"
24+
#include <climits>
2425
#include <cuda_fp16.h>
2526
#include <cuda_runtime.h>
2627
#include <functional>
@@ -46,14 +47,7 @@ using tensor_core::WMMA_N;
4647
/**
4748
* 检查当前设备是否支持 Tensor Core (sm_70+)
4849
*/
49-
inline bool tensorCoresAvailable() {
50-
int device;
51-
CUDA_CHECK(cudaGetDevice(&device));
52-
53-
cudaDeviceProp prop;
54-
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
55-
return prop.major >= 7;
56-
}
50+
inline bool tensorCoresAvailable() {
    // Answered from the cached device properties; no CUDA runtime query is issued here.
    const DeviceInfoCache &deviceInfo = DeviceInfoCache::instance();
    return deviceInfo.hasTensorCores();
}
5751

5852
/**
5953
* 检查给定维度是否适合 Tensor Core 加速
@@ -67,11 +61,7 @@ inline bool tensorCoreDimensionsSupported(int M, int K, int N) {
6761
* 获取当前设备的 Tensor Core 信息字符串
6862
*/
6963
inline const char *getTensorCoreArchName() {
70-
int device;
71-
CUDA_CHECK(cudaGetDevice(&device));
72-
73-
cudaDeviceProp prop;
74-
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
64+
const cudaDeviceProp &prop = DeviceInfoCache::instance().prop();
7565

7666
if (prop.major == 7) {
7767
return (prop.minor == 0) ? "Volta" : (prop.minor == 5) ? "Turing" : "Unknown sm_7x";
@@ -101,7 +91,8 @@ using FallbackKernel =
10191
*
10292
* 提供一个空的 fallback(用于测试或显式配置场景)
10393
*/
104-
inline void nullFallback(const float *, const float *, float *, int, int, int, cudaStream_t = 0) {
94+
[[maybe_unused]] inline void nullFallback(const float * /*A*/,
                                          const float * /*B*/,
                                          float * /*C*/,
                                          int /*M*/,
                                          int /*K*/,
                                          int /*N*/,
                                          cudaStream_t /*stream*/ = 0) {
    // Intentionally empty: a do-nothing fallback for tests or explicit configuration.
    // NOTE(review): the parameter names in the comments above are presumed from the
    // sibling SGEMM launchers in this header - confirm against the FallbackKernel alias.
}
10798

@@ -179,7 +170,7 @@ __global__ void tensor_core_sgemm_kernel_fp16(const half *__restrict__ A,
179170
*/
180171
inline void launch_tensor_core_sgemm_fp16_fast_path(const half *A, const half *B, float *C, int M,
181172
int K, int N, cudaStream_t stream = 0) {
182-
dim3 blockDim(32, 1);
173+
dim3 blockDim(kDefaultTileSize, 1);
183174
dim3 gridDim((N + WMMA_N - 1) / WMMA_N, (M + WMMA_M - 1) / WMMA_M);
184175

185176
tensor_core_sgemm_kernel_fp16<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
@@ -255,9 +246,17 @@ inline void launch_tensor_core_sgemm_with_fallback(const float *A, const float *
255246
DeviceMemory<half> d_A_fp16(num_A);
256247
DeviceMemory<half> d_B_fp16(num_B);
257248

258-
int blockSize = 256;
259-
int gridSizeA = static_cast<int>((num_A + blockSize - 1) / blockSize);
260-
int gridSizeB = static_cast<int>((num_B + blockSize - 1) / blockSize);
249+
int blockSize = kDefaultBlockSize;
250+
// 安全计算 gridSize,检查溢出
251+
// Computes ceil(num / blk) as an `int` grid dimension, throwing CudaError when the
// launch cannot be expressed safely.
//
// The element count itself is validated as well as the grid: the conversion launches
// below narrow the count with static_cast<int>(...) before handing it to the kernel, so
// any count above INT_MAX would be truncated there even though the resulting grid
// (count / blockSize) still fits in an int. Checking only the grid would let those
// sizes through.
auto safeGridSize = [](size_t num, int blk) -> int {
    const size_t intMax = static_cast<size_t>(INT_MAX);
    const size_t threadsPerBlock = static_cast<size_t>(blk);
    const size_t grid = (num + threadsPerBlock - 1) / threadsPerBlock;
    if (num > intMax || grid > intMax) {
        throw CudaError("Grid size overflow: matrix too large for kernel launch");
    }
    return static_cast<int>(grid);
};
258+
int gridSizeA = safeGridSize(num_A, blockSize);
259+
int gridSizeB = safeGridSize(num_B, blockSize);
261260

262261
float_to_half_kernel<<<gridSizeA, blockSize, 0, stream>>>(A, d_A_fp16.get(),
263262
static_cast<int>(num_A));

src/utils/benchmark_metrics.cuh

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -57,32 +57,11 @@ inline PerformanceMetrics calculateSgemmMetrics(int M, int K, int N, float time_
5757
* - 时钟频率
5858
*/
5959
inline float getTheoreticalPeakGflops() {
    // Device properties, per-SM core count and core clock all come from the shared cache.
    const DeviceInfoCache &cache = DeviceInfoCache::instance();
    const cudaDeviceProp &prop = cache.prop();

    // FP32 operations the whole device can retire per clock cycle:
    // SM count * cores per SM * 2 (one fused multiply-add counts as two FLOPs).
    const float flopsPerCycle = static_cast<float>(prop.multiProcessorCount) *
                                static_cast<float>(cache.coresPerSM()) * 2.0f;

    // clockGHz() is already in GHz (1e9 cycles/s), so FLOPs/cycle * GHz is GFLOP/s
    // directly. No further scale factor is applied: the previous extra "* 1000"
    // overstated the peak by three orders of magnitude.
    return flopsPerCycle * cache.clockGHz();
}
@@ -96,11 +75,7 @@ inline float getTheoreticalPeakGflops() {
9675
* - 内存总线宽度
9776
*/
9877
inline float getTheoreticalPeakBandwidth() {
99-
int device;
100-
CUDA_CHECK(cudaGetDevice(&device));
101-
102-
cudaDeviceProp prop;
103-
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
78+
const cudaDeviceProp &prop = DeviceInfoCache::instance().prop();
10479

10580
// 内存时钟频率 (Hz -> MHz)
10681
float memoryClockMHz = static_cast<float>(prop.memoryClockRate) / 1000.0f;
@@ -139,8 +114,12 @@ inline float calculateEfficiency(float actual_gflops, float peak_gflops) {
139114

140115
/**
141116
* 计算带宽利用率(相对于理论峰值的百分比)
117+
*
118+
* 注意:此函数为工具函数,供外部调用者使用。
119+
* 内部 benchmark 流程使用 calculateEfficiency。
142120
*/
143-
inline float calculateBandwidthUtilization(float actual_bandwidth, float peak_bandwidth) {
121+
[[maybe_unused]] inline float
122+
calculateBandwidthUtilization(float actual_bandwidth, float peak_bandwidth) {
144123
if (peak_bandwidth <= 0)
145124
return 0.0f;
146125
return (actual_bandwidth / peak_bandwidth) * 100.0f;
@@ -153,12 +132,16 @@ inline float calculateBandwidthUtilization(float actual_bandwidth, float peak_ba
153132
/**
154133
* 打印性能比较报告
155134
*
135+
* 注意:此函数为工具函数,供外部调用者打印格式化报告。
136+
* 内部 benchmark 流程使用 SGEMMBenchmark::printSummary。
137+
*
156138
* @param kernel_name 内核名称
157139
* @param metrics 性能指标
158140
* @param baseline_gflops 基线 GFLOPS(如 cuBLAS)
159141
*/
160-
inline void printPerformanceReport(const char *kernel_name, const PerformanceMetrics &metrics,
161-
float baseline_gflops = 0.0f) {
142+
[[maybe_unused]] inline void
143+
printPerformanceReport(const char *kernel_name, const PerformanceMetrics &metrics,
144+
float baseline_gflops = 0.0f) {
162145
printf(" %-30s | %8.3f ms | %10.2f GFLOPS | %8.2f GB/s | AI: %.1f\n", kernel_name,
163146
metrics.time_ms, metrics.gflops, metrics.bandwidth_gb_s, metrics.arithmetic_intensity);
164147

src/utils/cuda_utils.cuh

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,25 @@
77
#include <curand.h>
88
#include <random>
99

10+
// ============================================================================
11+
// 命名常量
12+
// ============================================================================
13+
14+
namespace config {

/** Default tile size (used by the SGEMM kernels). */
constexpr inline int kDefaultTileSize = 32;

/** Default block size (used for CUDA kernel launches). */
constexpr inline int kDefaultBlockSize = 256;

/** Size of the buffer used to format file names. */
constexpr inline int kFilenameBufferSize = 256;

} // namespace config

// Re-export the constants so existing code can use them unqualified.
using config::kDefaultTileSize;
using config::kDefaultBlockSize;
using config::kFilenameBufferSize;
28+
1029
// ============================================================================
1130
// Error Checking Macros
1231
// ============================================================================
@@ -56,7 +75,7 @@ template <typename T> class DeviceMemory {
5675
public:
5776
DeviceMemory() : ptr_(nullptr), size_(0) {}
5877

59-
explicit DeviceMemory(size_t count) : size_(count) {
78+
explicit DeviceMemory(size_t count) : ptr_(nullptr), size_(count) {
    // ptr_ starts out null, so it holds a defined value if the allocation check fails.
    const size_t bytes = count * sizeof(T);
    CUDA_CHECK(cudaMalloc(&ptr_, bytes));
}
6281

@@ -169,3 +188,62 @@ inline void printGPUInfo() {
169188
printf(" L2 Cache Size: %d KB\n", prop.l2CacheSize / 1024);
170189
printf("\n");
171190
}
191+
192+
// ============================================================================
193+
// Device Info Cache - 缓存设备属性避免重复查询
194+
// ============================================================================
195+
196+
/**
 * Cache of the current CUDA device's properties (singleton).
 *
 * The first call to instance() runs cudaGetDevice / cudaGetDeviceProperties once and
 * derives the FP32 core count per SM and the core clock in GHz. Every later call returns
 * the same object without querying the CUDA runtime again.
 *
 * NOTE: the snapshot belongs to whichever device is current at first use; a later
 * cudaSetDevice() is not reflected by this cache.
 */
class DeviceInfoCache {
  public:
    /// Returns the single shared instance, constructing it on first use.
    /// If the CUDA queries fail, CUDA_CHECK reports the error from the constructor.
    static DeviceInfoCache &instance() {
        static DeviceInfoCache cache;
        return cache;
    }

    // A singleton must not be copied: a copy would silently detach from the shared cache.
    DeviceInfoCache(const DeviceInfoCache &) = delete;
    DeviceInfoCache &operator=(const DeviceInfoCache &) = delete;

    /// Cached device properties.
    const cudaDeviceProp &prop() const { return prop_; }

    /// Ordinal of the device the properties were read from.
    int deviceId() const { return device_; }

    /// True when the device is sm_70 or newer (Tensor Core capable).
    bool hasTensorCores() const { return prop_.major >= 7; }

    /// FP32 CUDA cores per SM, derived from the compute capability.
    int coresPerSM() const { return coresPerSM_; }

    /// Core clock in GHz (prop().clockRate is reported in kHz).
    float clockGHz() const { return clockGHz_; }

  private:
    DeviceInfoCache() {
        CUDA_CHECK(cudaGetDevice(&device_));
        CUDA_CHECK(cudaGetDeviceProperties(&prop_, device_));
        coresPerSM_ = computeCoresPerSM();
        clockGHz_ = static_cast<float>(prop_.clockRate) / 1e6f; // kHz -> GHz
    }

    /// Maps the cached compute capability to the FP32 core count per SM.
    /// Values follow the CUDA C++ Programming Guide "Compute Capabilities" tables
    /// (the same table as _ConvertSMVer2Cores in the CUDA samples).
    int computeCoresPerSM() const {
        switch (prop_.major) {
        case 3:
            return 192; // Kepler
        case 5:
            return 128; // Maxwell
        case 6:
            return (prop_.minor == 0) ? 64 : 128; // Pascal: sm_60 = 64, sm_61/62 = 128
        case 7:
            return 64; // Volta (sm_70, sm_72), Turing (sm_75)
        case 8:
            // Only sm_80 (A100) has 64 FP32 cores per SM; sm_86 / sm_87 / sm_89 have 128.
            return (prop_.minor == 0) ? 64 : 128;
        case 9:  // Hopper
        case 10: // Blackwell (data center)
        case 11:
        case 12: // Blackwell (consumer)
            return 128;
        default:
            return 64; // Unknown architecture: conservative fallback
        }
    }

    int device_ = 0;
    cudaDeviceProp prop_{};
    int coresPerSM_ = 0;
    float clockGHz_ = 0.0f;
};

0 commit comments

Comments
 (0)