Skip to content

Commit cc97d93

Browse files
author
shijiashuai
committed
chore: snapshot local changes
1 parent 8b5c600 commit cc97d93

File tree

16 files changed

+489
-439
lines changed

16 files changed

+489
-439
lines changed

.github/workflows/ci.yml

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,48 @@ jobs:
4040
ruff check python/
4141
ruff format --check python/
4242
43+
consistency-check:
44+
name: Entry Consistency Check
45+
runs-on: ubuntu-latest
46+
steps:
47+
- uses: actions/checkout@v4
48+
49+
- name: Verify documented example files exist
50+
run: |
51+
python - <<'PY'
52+
from pathlib import Path
53+
54+
required = [
55+
Path('examples/01_elementwise/relu_example.cu'),
56+
Path('examples/03_gemm/gemm_benchmark.cu'),
57+
Path('examples/python/basic_usage.py'),
58+
]
59+
60+
missing = [str(path) for path in required if not path.exists()]
61+
if missing:
62+
raise SystemExit(f"Missing documented examples: {missing}")
63+
PY
64+
65+
- name: Verify Python module naming is aligned
66+
run: |
67+
python - <<'PY'
68+
from pathlib import Path
69+
70+
files = {
71+
'python/bindings/bindings.cpp': 'NB_MODULE(hpc_ai_opt, m)',
72+
'python/CMakeLists.txt': 'nanobind_add_module(hpc_ai_opt',
73+
'examples/python/basic_usage.py': 'import hpc_ai_opt as opt',
74+
'docs/python/index.rst': 'hpc_ai_opt',
75+
'README.md': 'hpc_ai_opt',
76+
'README.zh-CN.md': 'hpc_ai_opt',
77+
}
78+
79+
for file_path, expected in files.items():
80+
content = Path(file_path).read_text(encoding='utf-8')
81+
if expected not in content:
82+
raise SystemExit(f"Expected '{expected}' in {file_path}")
83+
PY
84+
4385
docs:
4486
name: Build Documentation
4587
runs-on: ubuntu-latest
@@ -80,12 +122,12 @@ jobs:
80122
ci-success:
81123
name: CI Success
82124
runs-on: ubuntu-latest
83-
needs: [format-check, docs]
125+
needs: [format-check, consistency-check, docs]
84126
if: always()
85127
steps:
86128
- name: Check all jobs
87129
run: |
88-
if [[ "${{ needs.format-check.result }}" != "success" || "${{ needs.docs.result }}" != "success" ]]; then
130+
if [[ "${{ needs.format-check.result }}" != "success" || "${{ needs.consistency-check.result }}" != "success" || "${{ needs.docs.result }}" != "success" ]]; then
89131
echo "One or more required jobs failed"
90132
exit 1
91133
fi

.github/workflows/pages.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ name: Documentation
33

44
on:
55
push:
6-
branches: [main]
6+
branches: [master, main]
77
paths:
88
- 'docs/**'
99
- 'src/**/*.cuh'
1010
- 'src/**/*.cu'
1111
- 'python/**'
1212
pull_request:
13-
branches: [main]
13+
branches: [master, main]
1414
paths:
1515
- 'docs/**'
1616
workflow_dispatch:
@@ -66,7 +66,7 @@ jobs:
6666
- name: Combine documentation
6767
run: |
6868
mkdir -p docs/_site
69-
# Copy main docs
69+
cp docs/index.html docs/_site/index.html
7070
cp -r docs/*.md docs/_site/
7171
# Copy Doxygen output
7272
if [ -d docs/api/html ]; then
@@ -85,7 +85,7 @@ jobs:
8585
deploy:
8686
name: Deploy to GitHub Pages
8787
needs: build-docs
88-
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
88+
if: github.event_name == 'push' && (github.ref == 'refs/heads/master' || github.ref == 'refs/heads/main')
8989
runs-on: ubuntu-latest
9090
environment:
9191
name: github-pages

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,6 @@ Thumbs.db
8181
# Environment
8282
.env
8383
.env.local
84+
85+
# Local AI agent worktrees
86+
.claude/

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313
- Code quality tools: .clang-format, .clang-tidy, .editorconfig, pre-commit
1414
- CI/CD with GitHub Actions
1515
- Doxygen and Sphinx documentation configuration
16+
- Entry consistency checks for documented examples and Python module naming
17+
18+
### Changed
19+
- Unified the Python extension module name to `hpc_ai_opt`
20+
- Reworked Python entry documentation and examples to match the current bindings surface
21+
- Connected `examples/` to the top-level CMake build via `BUILD_EXAMPLES`
22+
- Updated README and examples documentation to reflect the repository's actual structure and Python workflow
1623

1724
## [0.1.0] - 2024-01-01
1825

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ endif()
5858

5959
# Nanobind (optional, for Python bindings)
6060
option(BUILD_PYTHON_BINDINGS "Build Python bindings" OFF)
61+
option(BUILD_EXAMPLES "Build example programs" OFF)
6162
if(BUILD_PYTHON_BINDINGS)
6263
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
6364
FetchContent_Declare(
@@ -109,6 +110,11 @@ if(BUILD_PYTHON_BINDINGS)
109110
add_subdirectory(python)
110111
endif()
111112

113+
# Examples
114+
if(BUILD_EXAMPLES)
115+
add_subdirectory(examples)
116+
endif()
117+
112118
# Tests
113119
enable_testing()
114120
add_subdirectory(tests)

README.md

Lines changed: 50 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,74 +2,72 @@
22

33
English | [简体中文](README.zh-CN.md)
44

5-
<p align="center">
6-
<b>A Living Textbook for High-Performance CUDA Kernel Development</b>
7-
</p>
5+
A CUDA optimization lab for AI kernels, organized as a set of focused kernel modules, tests, examples, and lightweight Python bindings.
86

9-
![CUDA](https://img.shields.io/badge/CUDA-13.1+-76B900?style=flat-square&logo=nvidia)
10-
![C++20](https://img.shields.io/badge/C++-20-00599C?style=flat-square&logo=cplusplus)
11-
![Architecture](https://img.shields.io/badge/Architecture-Hopper%2FBlackwell-green?style=flat-square)
12-
![License](https://img.shields.io/badge/License-MIT-blue?style=flat-square)
7+
## What is in the repository
138

14-
---
9+
- `src/common/`: shared CUDA utilities such as tensor wrappers, timers, launch helpers, and reduction primitives
10+
- `src/01_elementwise/` to `src/07_cuda13_features/`: numbered kernel modules covering elementwise ops, reductions, GEMM, convolution, attention, quantization, and newer CUDA features
11+
- `tests/`: GoogleTest + RapidCheck coverage across kernel modules
12+
- `examples/`: currently shipped CUDA and Python examples
13+
- `python/`: nanobind bindings plus benchmark scripts
14+
- `docs/`: optimization notes and Python binding docs
1515

16-
## Overview
16+
## Build the C++/CUDA project
1717

18-
A systematic CUDA high-performance computing tutorial, from naive implementations to extreme optimization, covering core operators needed by modern AI models (LLM, Diffusion).
19-
20-
## Modules
18+
```bash
19+
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
20+
cmake --build build -j$(nproc)
21+
ctest --test-dir build --output-on-failure
22+
```
2123

22-
| Module | Description | Key Techniques |
23-
|--------|-------------|----------------|
24-
| **GEMM** | Matrix multiplication optimization | Tiled → Register Blocked → Tensor Core |
25-
| **Attention** | FlashAttention variants | Online Softmax, causal masking |
26-
| **Normalization** | LayerNorm, RMSNorm | Warp shuffle, vectorized loads |
27-
| **Elementwise** | Activation functions | GELU, SiLU, vectorized |
28-
| **Quantization** | INT8/FP8 | Calibration, per-channel scaling |
29-
| **Fusion** | Kernel fusion patterns | Bias+Act, LayerNorm+Residual |
24+
## Build the Python bindings
3025

31-
## Quick Start
26+
The current Python extension is named `hpc_ai_opt` and exposes low-level submodules such as `elementwise`, `reduction`, and `gemm`.
3227

3328
```bash
34-
git clone https://github.com/LessUp/hpc-ai-optimization-lab.git
35-
cd hpc-ai-optimization-lab
29+
cmake -S . -B build -DBUILD_PYTHON_BINDINGS=ON
30+
cmake --build build
31+
export PYTHONPATH="$(pwd)/build/python:${PYTHONPATH}"
32+
python -c "import hpc_ai_opt; print(hpc_ai_opt.__doc__)"
33+
python examples/python/basic_usage.py
34+
```
3635

37-
cmake -B build -DCMAKE_BUILD_TYPE=Release
38-
cmake --build build -j$(nproc)
39-
ctest --test-dir build --output-on-failure
36+
## Build the shipped examples
37+
38+
```bash
39+
cmake -S . -B build -DBUILD_EXAMPLES=ON
40+
cmake --build build --target relu_example gemm_benchmark
4041
```
4142

42-
## Requirements
43+
## Current Python API shape
4344

44-
- CUDA Toolkit 13.1+ (Hopper/Blackwell recommended)
45-
- CMake 3.20+, C++20 compiler
46-
- GPU: SM 8.0+ (Ampere or newer)
45+
```python
46+
import torch
47+
import hpc_ai_opt
4748

48-
## Project Structure
49+
x = torch.randn(1024, 1024, device="cuda", dtype=torch.float32)
50+
y = torch.empty_like(x)
4951

52+
hpc_ai_opt.elementwise.relu(x, y)
5053
```
51-
hpc-ai-optimization-lab/
52-
├── src/ # Kernel implementations
53-
│ ├── gemm/ # GEMM optimization levels
54-
│ ├── attention/ # Attention kernels
55-
│ ├── normalization/ # Norm kernels
56-
│ ├── elementwise/ # Activation kernels
57-
│ └── quantization/ # Quantization kernels
58-
├── include/ # Public headers
59-
├── tests/ # Google Test suite
60-
├── benchmarks/ # Performance benchmarks
61-
├── docs/ # Documentation
62-
└── .github/workflows/ # CI
63-
```
6454

65-
## Key Topics
55+
The current bindings are intentionally thin:
56+
- CUDA tensors are passed in directly
57+
- output tensors are allocated by the caller
58+
- some kernels require explicit shape arguments
59+
60+
## Requirements
6661

67-
- **Memory Hierarchy**: Global → Shared → Register optimization
68-
- **Tensor Core Programming**: WMMA / MMA for mixed-precision compute
69-
- **Async Operations**: TMA, async copy, pipeline overlapping
70-
- **Warp-Level Primitives**: Shuffle, vote, cooperative groups
71-
- **Kernel Fusion**: Reducing HBM round-trips
62+
- CUDA Toolkit 13.1+
63+
- CMake 3.24+
64+
- A C++20 compiler
65+
- An NVIDIA GPU with CUDA support
66+
- PyTorch with CUDA support for the Python example path
7267

73-
## License
68+
## Documentation
7469

75-
MIT License
70+
- `docs/README.md`
71+
- `docs/python/index.rst`
72+
- `docs/01_gemm_optimization.md`
73+
- `docs/04_flash_attention.md`

README.zh-CN.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,15 +147,15 @@ cmake .. -GNinja && ninja
147147
### 安装 Python 绑定
148148

149149
```bash
150-
# 在 build 目录下
151-
cmake .. -DBUILD_PYTHON_BINDINGS=ON
152-
ninja
150+
# 在仓库根目录下
151+
cmake -S . -B build -DBUILD_PYTHON_BINDINGS=ON
152+
cmake --build build
153153

154-
# 安装到 Python 环境
155-
pip install python/
154+
# 将扩展模块加入 Python 搜索路径
155+
export PYTHONPATH="$(pwd)/build/python:${PYTHONPATH}"
156156

157157
# 验证安装
158-
python -c "import hpc_kernels; print('Success!')"
158+
python -c "import hpc_ai_opt; print('Success!')"
159159
```
160160

161161
## 📚 优化案例详解
@@ -459,14 +459,14 @@ constexpr int SMEM_SIZE = TILE_SIZE * (TILE_SIZE + 1); // +1 避免 Bank Confli
459459

460460
```python
461461
import torch
462-
import hpc_kernels
462+
import hpc_ai_opt
463463

464464
# 零拷贝:直接使用 PyTorch CUDA Tensor
465465
x = torch.randn(1024, 1024, device='cuda')
466466
y = torch.empty_like(x)
467467

468468
# 调用我们的 Kernel
469-
hpc_kernels.elementwise.relu(x, y)
469+
hpc_ai_opt.elementwise.relu(x, y)
470470

471471
# 验证结果
472472
assert torch.allclose(y, torch.relu(x))
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# 入口闭环第一阶段完善记录
2+
3+
日期:2026-03-22
4+
5+
## 变更内容
6+
7+
### Python 绑定与命名统一
8+
- 将 Python 扩展模块名从 `hpc_kernels` 统一为 `hpc_ai_opt`
9+
- 同步更新 `python/bindings/bindings.cpp` 与 `python/CMakeLists.txt`
10+
- 保留现有 `elementwise` / `reduction` / `gemm` 子模块 API 结构,不在本阶段引入新的高层兼容层
11+
12+
### 示例与文档入口收敛
13+
- 重写 `examples/python/basic_usage.py`,使其只演示当前 bindings 真实暴露的 API
14+
- 重写 `docs/python/index.rst`,移除不存在的 toctree 页面,改为真实单页入口
15+
- 重写 `examples/README.md`,仅保留仓库当前真实存在的示例文件与运行方式
16+
- 更新 `README.md` 与 `README.zh-CN.md` 中的 Python 导入、构建与示例说明,使其与 `hpc_ai_opt` 和当前 bindings 一致
17+
18+
### 构建入口与 CI 一致性
19+
- 在根 `CMakeLists.txt` 中增加 `BUILD_EXAMPLES` 并接入 `examples/`
20+
- 移除 `examples/CMakeLists.txt` 中重复定义的 `BUILD_EXAMPLES`,改由顶层统一控制
21+
- 在 `.github/workflows/ci.yml` 中新增 `Entry Consistency Check`,校验:
22+
- 文档声明的示例文件真实存在
23+
- 关键入口文件中的 Python 模块命名一致
24+
- 保持主线 CI 的 CPU-safe 边界,不引入依赖 GPU runner 的构建/测试矩阵
25+
26+
## 背景
27+
28+
本次工作聚焦“入口闭环”而非内核能力扩展。仓库此前的主要问题是:Python 模块命名、示例、文档、examples 构建入口和 CI 之间存在明显漂移,导致项目对外入口失真。第一阶段先解决这些问题,为后续测试增强和高级内核补强建立稳定基础。

0 commit comments

Comments
 (0)