Skip to content

Commit 0ae1249

Browse files
author
330-shh
committed
添加readme
1 parent 2b0984c commit 0ae1249

72 files changed

Lines changed: 3349 additions & 446 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.clang-format

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
Language: Cpp
3+
BasedOnStyle: Google
4+
Standard: c++17
5+
ColumnLimit: 120
6+
IndentWidth: 4
7+
8+
# CUDA-specific formatting
9+
SpaceBeforeParens: ControlStatements
10+
AlignConsecutiveAssignments: true
11+
AlignConsecutiveDeclarations: true
12+
DerivePointerAlignment: false
13+
PointerAlignment: Right
14+
15+
# Keep blocks concise
16+
AllowShortBlocksOnASingleLine: true
17+
AllowShortFunctionsOnASingleLine: Inline
18+
AllowShortIfStatementsOnASingleLine: false
19+
AllowShortLoopsOnASingleLine: false
20+
21+
# Includes
22+
SortIncludes: true
23+
IncludeBlocks: Regroup
24+
25+
# CUDA macro handling
26+
MacroBlockBegin: "^__global__.*$"
27+
StatementMacros: ["__syncthreads", "CUDA_CHECK", "CUDA_CHECK_LAST"]

.github/workflows/build.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
name: CUDA Build Check
2+
3+
on:
4+
push:
5+
branches: [ main, master ]
6+
pull_request:
7+
branches: [ main, master ]
8+
9+
jobs:
10+
build:
11+
runs-on: ubuntu-latest
12+
13+
steps:
14+
- uses: actions/checkout@v3
15+
16+
# ubuntu-latest 环境可能已自带部分 C++/CUDA 组件,但不保证包含完整的 CUDA Toolkit
17+
# 因此这里使用现成的 action(Jimver/cuda-toolkit)安装完整的 CUDA Toolkit
18+
- name: Install CUDA Toolkit
19+
uses: Jimver/cuda-toolkit@v0.2.11
20+
id: cuda-toolkit
21+
with:
22+
cuda: '12.2.0'
23+
24+
- name: Check nvcc
25+
run: nvcc -V
26+
27+
- name: Configure CMake
28+
# 这里使用我们项目的根目录 CMakeLists.txt
29+
run: cmake -S . -B build
30+
31+
- name: Build All Projects
32+
# 并发编译所有项目
33+
run: cmake --build build --parallel 4

.gitignore

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
build/
33
docs/
44
cmake-build*/
5+
out/
56
*.o
67
*.obj
78
*.exe
@@ -22,3 +23,20 @@ cmake-build*/
2223
*.ptx
2324
*.fatbin
2425
*.gpu
26+
27+
# Python & PyTorch C++ Extension 产物
28+
__pycache__/
29+
*.pyc
30+
*.pyo
31+
*.pyd
32+
build/lib.*
33+
build/temp.*
34+
*.egg-info/
35+
dist/
36+
37+
# AI 助手与工作流存储
38+
.agents/
39+
.agent/
40+
_agents/
41+
_agent/
42+
.gemini/

01_Basics/01_vector_add/vector_add.cu

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,5 @@
1-
#include <iostream>
2-
#include <vector>
3-
#include <cmath>
4-
#include <iomanip>
5-
#include "cuda_utils.cuh"
6-
#include "timer.cuh"
1+
#include <code_abbreviation.h>
72

8-
using namespace std;
93

104
// 向量加法(GPU kernel,手写)
115
__global__ void vector_add(const float* A, const float* B, float* C, const int n) {
@@ -23,13 +17,6 @@ void vector_add_cpu(const vector<float>& h_a, const vector<float>& h_b,
2317
}
2418
}
2519

26-
// GPU 计时结果结构体(AI 生成)
27-
struct GpuTimingResult {
28-
float h2d_ms; // Host to Device 传输时间
29-
float kernel_ms; // Kernel 执行时间(多次平均)
30-
float d2h_ms; // Device to Host 传输时间
31-
float total_ms; // 总时间
32-
};
3320

3421
// 向量加法(GPU,手写)
3522
GpuTimingResult vector_add_device(const vector<float>& h_a, const vector<float>& h_b,
@@ -96,7 +83,7 @@ bool verify_results(const vector<float>& h_a, const vector<float>& h_b,
9683
error_count = 0;
9784
for (int i = 0; i < n; ++i) {
9885
float expected = h_a[i] + h_b[i];
99-
if (std::fabs(h_c[i] - expected) > 1e-5) {
86+
if (fabs(h_c[i] - expected) > 1e-5) {
10087
error_count++;
10188
// 打印前 5 个错误
10289
if (error_count <= 5) {

01_Basics/02_matrix_mul_naive/matrix_mul_naive.cu

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,8 @@
1-
#include <iostream>
2-
#include <vector>
3-
#include <cmath>
4-
#include <iomanip>
5-
#include "cuda_utils.cuh"
6-
#include "timer.cuh"
7-
8-
using namespace std;
9-
using matrix = vector<float>; // 行主序
10-
using Int = const int;
11-
constexpr size_t sizeF = sizeof(float);
1+
#include <code_abbreviation.h>
2+
3+
using matrix = Matrix; // 兼容本文件中的 'matrix' 别名
4+
using Int = CInt; // 兼容本文件中的 'Int' 别名
5+
constexpr size_t sizeF = FSIZE; // 兼容本文件中的 'sizeF' 别名
126

137
// 矩阵乘法(GPU kernel,手写)
148
__global__ void matrix_mul_naive(const float* A, const float* B, float* C, Int m, Int n, Int k){
@@ -36,13 +30,6 @@ void matrix_mul_cpu(const float* A, const float* B, float* C, Int m, Int n, Int
3630
}
3731
}
3832

39-
// GPU 计时结果结构体(AI 生成)
40-
struct GpuTimingResult {
41-
float h2d_ms; // Host to Device 传输时间
42-
float kernel_ms; // Kernel 执行时间(多次平均)
43-
float d2h_ms; // Device to Host 传输时间
44-
float total_ms; // 总时间
45-
};
4633

4734
// 矩阵乘法(GPU,手写)
4835
GpuTimingResult matrix_mul_naive_device(const matrix& h_a, const matrix& h_b, matrix& h_c, Int m, Int n, Int k, Int iterations = 100) {

01_Basics/03_matrix_mul_tiled/matrix_mul_tiled.cu

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,6 @@
1-
#include <iostream>
2-
#include <vector>
3-
#include <cmath>
4-
#include <iomanip>
5-
#include "cuda_utils.cuh"
6-
#include "timer.cuh"
7-
8-
using namespace std;
9-
using CInt = const int;
10-
using CSize = const size_t;
11-
using CPFloat = const float*;
12-
using PFloat = float*;
13-
using CRMatrix = const vector<float>&;
14-
using RMatrix = vector<float>&;
15-
using Matrix = vector<float>;
1+
#include <code_abbreviation.h>
2+
163
constexpr int TILE_WIDTH = 32;
17-
constexpr size_t FSIZE = sizeof(float);
184

195
// 矩阵乘法-使用共享内存分块优化(GPU kernel,手写)
206
__global__ void matrix_mul_tiled(CPFloat a, CPFloat b, PFloat c, CInt m, CInt n, CInt k) {
@@ -78,13 +64,6 @@ void matrix_mul_cpu(CPFloat a, CPFloat b, PFloat c, CInt m, CInt n, CInt k) {
7864
}
7965
}
8066

81-
// GPU 计时结果结构体(AI 生成)
82-
struct GpuTimingResult {
83-
float h2d_ms; // Host to Device 传输时间
84-
float kernel_ms; // Kernel 执行时间(多次平均)
85-
float d2h_ms; // Device to Host 传输时间
86-
float total_ms; // 总时间
87-
};
8867

8968
// 矩阵乘法(GPU,手写)
9069
GpuTimingResult matrix_mul_tiled_device(CRMatrix a, CRMatrix b, RMatrix c, CInt m, CInt n, CInt k, CInt iterations = 100) {

01_Basics/README.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# 01_Basics: CUDA 基础编程与执行模型
2+
3+
## 1. 全景导览与学习目标 (Overview & Learning Objectives)
4+
5+
本章是 CUDA 学习的起点,旨在让开发者熟悉 CUDA 的异构编程模型、线程层级结构(Grid, Block, Thread)以及最基础的全局内存(Global Memory)访问。它涵盖了如何从 CPU 侧向 GPU 发起计算请求,以及 GPU 计算内核(Kernel)的基本编写范式。这些是最基础也是最核心的 CUDA 概念。
6+
7+
目录下的实现逐步加深对计算与内存瓶颈的理解:
8+
9+
- `01_vector_add/`:演示最基本的单维度网格与线程块划分,以及内存的 Allocate、Copy 与 Free 流程。
10+
- `02_matrix_mul_naive/`:演示二维线程网格映射,实现基础的矩阵乘法,暴露出未优化的全局内存的冗余访存问题。
11+
- `03_matrix_mul_tiled/`:引入基于共享内存(Shared Memory)的分块(Tiling)策略,大幅减少全局内存带宽的浪费,为后续更高级的优化打下基础。
12+
13+
## 2. 原理推导与数学表达 (Math & Logic)
14+
15+
向量加法是最纯粹的 Element-wise 操作:
16+
$$ C_i = A_i + B_i $$
17+
18+
对于矩阵乘法 $C = A \times B$(假设矩阵大小为 $M \times K$ 和 $K \times N$),目标元素计算式为:
19+
$$ C_{i, j} = \sum_{k=0}^{K-1} A_{i, k} \cdot B_{k, j} $$
20+
在朴素实现中,这需要执行 $O(M \cdot N \cdot K)$ 次全局内存访问。而在 Tiling 优化中,我们将大矩阵分解为大小为 $B_s \times B_s$ 的小块,此时访存量缩减至原先的 $1/B_s$(理论上)。
21+
22+
## 3. 硬核内存映射解析 (Memory & Thread Mapping)
23+
24+
以 2D Tiling 矩阵乘法为例,使用 Shared Memory 降低对全局内存带宽的占用(示意图假定 Block 大小为 $16 \times 16$;`03_matrix_mul_tiled/matrix_mul_tiled.cu` 中实际取 `TILE_WIDTH = 32`):
25+
26+
```text
27+
[Global Memory] 矩阵 A 与 B
28+
---------------------------------------------------
29+
| |
30+
| [Shared Mem 块 `sA` 16x16] |
31+
| +--------------------+ |
32+
| | Thread 映射区(0,0) | <-- Block内的(tx, ty) |
33+
| | 到(15, 15)协作加载 | |
34+
| +--------------------+ |
35+
---------------------------------------------------
36+
||
37+
\/ (寄存器做内积累加)
38+
39+
计算局部 $C_{sub}$ 积累至对应的 Global Memory / Register
40+
```
41+
42+
每个 Block 合作将数据从 Global Memory 读入 Shared Memory 后,需要调用 `__syncthreads()` 确保数据就绪,然后再执行乘累加。
43+
44+
## 4. 关键源码逐行解剖 (Code Deep-Dive)
45+
46+
来自 `03_matrix_mul_tiled/matrix_mul_tiled.cu` 的共享内存同步读取:
47+
48+
```cpp
49+
// 声明 2D 共享内存,用于存储 A 和 B 的子块,利用高带宽和极低延迟
50+
__shared__ float s_A[TILE_WIDTH][TILE_WIDTH];
51+
__shared__ float s_B[TILE_WIDTH][TILE_WIDTH];
52+
53+
// 协作加载:当前 Thread (tx, ty) 负责把全局内存中属于它的那个元素搬到共享内存
54+
s_A[ty][tx] = A[Row * k + (ph * TILE_WIDTH + tx)];
55+
s_B[ty][tx] = B[(ph * TILE_WIDTH + ty) * n + Col];
56+
57+
// ⚠️ 极其关键的屏障:必须等该 Block 内部所有 Thread 完成搬运,才能开始计算
58+
__syncthreads();
59+
60+
// 在共享内存上计算局部的乘积累加
61+
for (int j = 0; j < TILE_WIDTH; ++j) {
62+
Cvalue += s_A[ty][j] * s_B[j][tx];
63+
}
64+
65+
// ⚠️ 释放屏障:防止迭代过快导致下一轮的加载覆盖了当前还在计算的数据
66+
__syncthreads();
67+
```
68+
69+
## 5. 性能基准与分析视角 (Performance & Profiling)
70+
71+
- **基准**:对比 CPU 上的顺序循环矩阵乘法,以及简单的朴素版(Naïve)CUDA 矩阵乘法。
72+
- **典型分析**:使用 Tiled 分块后,全局显存带宽的使用率大幅度降低。在 NCU 中观察 `sm__throughput` 与 `l1tex__t_sectors_pipe_lsu_mem_global_op_ld` 的比值,可以看到 Memory Workload 明显改善,计算受限(Compute Bound)的比重随之增加。
73+
74+
## 6. 编译指引与参考资料 (Compile & References)
75+
76+
```bash
77+
# 通用编译指令(-arch 需与目标 GPU 的计算能力一致;sm_89 对应 Ada Lovelace 架构,如 RTX 40 系列,其他显卡请相应修改)
78+
nvcc -O3 -arch=sm_89 matrix_mul_tiled.cu -o run_tiled
79+
# NCU 性能分析指令(提取内存带宽和SM利用率)
80+
ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,dram__throughput.avg.pct_of_peak_sustained_elapsed ./run_tiled
81+
```
82+
83+
- 参考资料: [CUDA C++ Programming Guide - Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory)

02_Reduction/01_reduce_sum/reduce_sum.cu

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,5 @@
1-
#include <iostream>
2-
#include <vector>
3-
#include <cmath>
4-
#include <iomanip>
5-
#include "cuda_utils.cuh"
6-
#include "timer.cuh"
7-
8-
using namespace std;
9-
using CInt = const int;
10-
using CSize = const size_t;
11-
using PFloat = float*;
12-
using CPFloat = const float*;
13-
using Matrix = vector<float>;
14-
using RMatrix = vector<float>&;
15-
using CRMatrix = const vector<float>&;
16-
17-
constexpr size_t FSIZE = sizeof(float);
18-
constexpr int BLOCK_SIZE = 1024;
1+
#include <code_abbreviation.h>
2+
193

204
// 归约(GPU kernel,手写)
215
__global__ void simple_reduce_sum(PFloat input, PFloat output) {
@@ -70,13 +54,6 @@ float reduce_sum_cpu(CPFloat data, CInt length) {
7054
return static_cast<float>(total);
7155
}
7256

73-
// GPU 计时结果结构体(AI 生成)
74-
struct GpuTimingResult {
75-
float h2d_ms; // Host to Device 传输时间
76-
float kernel_ms; // Kernel 执行时间(多次平均)
77-
float d2h_ms; // Device to Host 传输时间
78-
float total_ms; // 总时间
79-
};
8057

8158
// 验证结果(AI 生成)
8259
bool verify_results(float gpu_result, float cpu_result, const string& kernel_name) {

02_Reduction/02_reduce_optimized/reduce_optimized.cu

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,5 @@
1-
#include <iostream>
2-
#include <vector>
3-
#include <cmath>
4-
#include <iomanip>
5-
#include "cuda_utils.cuh"
6-
#include "timer.cuh"
7-
8-
using namespace std;
9-
using CInt = const int;
10-
using CFloat = const float;
11-
using CSize = const size_t;
12-
using PFloat = float*;
13-
using CPFloat = const float*;
14-
using Matrix = vector<float>;
15-
using RMatrix = vector<float>&;
16-
using CRMatrix = const vector<float>&;
17-
18-
constexpr int BLOCK_SIZE = 1024;
19-
constexpr size_t FSIZE = sizeof(float);
20-
constexpr int COARSE_FACTOR = 4;
1+
#include <code_abbreviation.h>
2+
213

224
// 归约-任意长度(GPU kernel,手写)
235
__global__ void segmented_reduce_sum(PFloat input, PFloat output, CInt length) {
@@ -143,13 +125,6 @@ bool verify_results(float gpu_result, float cpu_result, const string& kernel_nam
143125
}
144126
}
145127

146-
// GPU 计时结果结构体(AI 生成)
147-
struct GpuTimingResult {
148-
float h2d_ms; // Host to Device 传输时间
149-
float kernel_ms; // Kernel 执行时间(多次平均)
150-
float d2h_ms; // Device to Host 传输时间
151-
float total_ms; // 总时间
152-
};
153128

154129
// 通用归约求和 GPU 封装(GPU,手写)
155130
template<typename KernelFunc>

02_Reduction/03_dot_product/dot_product.cu

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include <code_abbreviation.h>
1+
#include <code_abbreviation.h>
22

33
// 点积-共享内存(GPU kernel,手写)
44
__global__ void shared_dot_product(CPFloat a, CPFloat b, PFloat output, CInt size) {
@@ -104,13 +104,6 @@ bool verify_results(RFloat gpu_result, RFloat cpu_result, const string& kernel_n
104104
}
105105
}
106106

107-
// GPU 计时结果结构体(AI 生成)
108-
struct GpuTimingResult {
109-
float h2d_ms; // Host to Device 传输时间
110-
float kernel_ms; // Kernel 执行时间(多次平均)
111-
float d2h_ms; // Device to Host 传输时间
112-
float total_ms; // 总时间
113-
};
114107

115108
// 通用点积 GPU 封装(GPU,手写)
116109
template<typename KernelFunc>

0 commit comments

Comments
 (0)