English | [简体中文](README.zh-CN.md)

A CUDA optimization lab for AI kernels, organized as a set of focused kernel modules, tests, examples, and lightweight Python bindings.

## What is in the repository

- `src/common/`: shared CUDA utilities such as tensor wrappers, timers, launch helpers, and reduction primitives
- `src/01_elementwise/` to `src/07_cuda13_features/`: numbered kernel modules covering elementwise ops, reductions, GEMM, convolution, attention, quantization, and newer CUDA features
- `tests/`: GoogleTest + RapidCheck coverage across kernel modules
- `examples/`: currently shipped CUDA and Python examples
- `python/`: nanobind bindings plus benchmark scripts
- `docs/`: optimization notes and Python binding docs

## Build the C++/CUDA project

```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build -j$(nproc)
ctest --test-dir build --output-on-failure
```

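If CMake's default architecture detection does not match your GPU, the standard `CMAKE_CUDA_ARCHITECTURES` cache variable can pin the target SM at configure time. The value `90` below is only an illustration (Hopper); substitute the compute capability of your own hardware.

```shell
# Sketch: pin the CUDA architecture explicitly (the value 90 is an example).
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=90
cmake --build build -j$(nproc)
```
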
## Build the Python bindings

The current Python extension is named `hpc_ai_opt` and exposes low-level submodules such as `elementwise`, `reduction`, and `gemm`.

```bash
cmake -S . -B build -DBUILD_PYTHON_BINDINGS=ON
cmake --build build
export PYTHONPATH="$(pwd)/build/python:${PYTHONPATH}"
python -c "import hpc_ai_opt; print(hpc_ai_opt.__doc__)"
python examples/python/basic_usage.py
```

## Build the shipped examples

```bash
cmake -S . -B build -DBUILD_EXAMPLES=ON
cmake --build build --target relu_example gemm_benchmark
```

## Current Python API shape

```python
import torch
import hpc_ai_opt

x = torch.randn(1024, 1024, device="cuda", dtype=torch.float32)
y = torch.empty_like(x)

hpc_ai_opt.elementwise.relu(x, y)
```
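One way to spot-check a kernel such as `elementwise.relu` on small inputs is against a trivial host-side reference. `relu_ref` below is a hypothetical helper for illustration, not part of the bindings; it runs without a GPU.

```python
# Hypothetical helper (not part of hpc_ai_opt): a tiny host-side ReLU
# reference for spot-checking kernel outputs on small inputs.
def relu_ref(values):
    """ReLU on a flat Python list: max(0, v) elementwise."""
    return [v if v > 0.0 else 0.0 for v in values]

print(relu_ref([-1.5, 0.0, 2.0]))  # [0.0, 0.0, 2.0]
```

To compare against the CUDA path, one would copy the kernel's output tensor back to the host and check it elementwise against `relu_ref` within a small tolerance.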

The current bindings are intentionally thin:
- CUDA tensors are passed in directly
- output tensors are allocated by the caller
- some kernels require explicit shape arguments

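The caller-allocates-output convention can be hidden behind a small adapter if a `y = f(x)` style is preferred. `make_out_wrapper` below is a hypothetical sketch, not shipped with the bindings; the toy kernel and list allocator stand in so the example runs without a GPU.

```python
# Hypothetical adapter (not shipped with hpc_ai_opt): turns an
# out-parameter kernel `kernel(x, out)` into a `y = f(x)` call.
def make_out_wrapper(kernel, alloc):
    def wrapped(x):
        out = alloc(x)    # caller-side allocation happens here, once
        kernel(x, out)    # kernel writes into the preallocated buffer
        return out
    return wrapped

# Toy stand-ins so the sketch runs anywhere: a "kernel" that writes
# ReLU results into `out`, and an allocator for flat Python lists.
def toy_relu_kernel(x, out):
    for i, v in enumerate(x):
        out[i] = v if v > 0.0 else 0.0

relu = make_out_wrapper(toy_relu_kernel, lambda x: [0.0] * len(x))
print(relu([-2.0, 3.0]))  # [0.0, 3.0]
```

With the real bindings one could pass `hpc_ai_opt.elementwise.relu` as the kernel and `torch.empty_like` as the allocator, since the API example above takes `(input, output)` in that order.
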
## Requirements

- CUDA Toolkit 13.1+
- CMake 3.24+
- A C++20 compiler
- An NVIDIA GPU with CUDA support
- PyTorch with CUDA support for the Python example path

## Documentation

- `docs/README.md`
- `docs/python/index.rst`
- `docs/01_gemm_optimization.md`
- `docs/04_flash_attention.md`