Fix clangd Configs and Update Transformer Examples

jamesnulliu · jamesnulliu · commit f3a241ee4185 · 2026-03-10T08:18:55.000Z
diff --git a/.clangd b/.clangd
@@ -1,20 +1,3 @@
-CompileFlags:
-  Add: 
-    - -std=c++20
-    - --no-cuda-version-check
-  Remove:
-    - -ccbin
-    - -forward-unknown-to-host-compiler
-    - -rdc=true
-    - -gencode
-    - --generate-code*
-    - -Xcudafe
-    - --diag_suppress=*
-    - --expt-relaxed-constexpr
-    - --expt-extended-lambda
-    - -Xcompiler*
-    - -arch=*
-
 Index:
   Background: Build
   StandardLibrary: Yes
@@ -36,6 +19,7 @@ Diagnostics:
       readability-identifier-length,
       readability-magic-numbers,
       readability-function-cognitive-complexity,
+      readability-redundant-access-specifiers,
       modernize-avoid-c-arrays,
       readability-math-missing-parentheses,
     ]
@@ -50,4 +34,32 @@ Diagnostics:
       readability-identifier-naming.ClassCase: CamelCase
       readability-identifier-naming.StructCase: CamelCase
       readability-identifier-naming.FunctionCase: camelBack
-      readability-identifier-naming.ClassMethodCase: camelBack
+      readability-identifier-naming.ClassMethodCase: camelBack
+
+CompileFlags:
+  Add: 
+    - -Wall
+    - -Wextra
+  Remove:
+    - -ccbin
+    - -forward-unknown-to-host-compiler
+    - --generate-code*
+    - -arch*
+    - -rdc=true
+    - -Xcudafe
+    - --diag_suppress=*
+    - --expt-relaxed-constexpr
+    - --expt-extended-lambda
+    - -gencode
+    - -Xcompiler*
+    - -fmodules*
+    - -fmodule-mapper*
+    - -fdeps-format*
+
+---
+
+If:
+  PathMatch: [.*\.cu, .*\.cuh]
+CompileFlags:
+  Add: 
+    - --cuda-gpu-arch=sm_89
diff --git a/csrc/include/pmpp/types/concepts.hpp b/csrc/include/pmpp/types/concepts.hpp
@@ -3,6 +3,7 @@
 #include <iterator>
 #include <ranges>
 #include <type_traits>
+#include <concepts>
 
 namespace pmpp
 {
diff --git a/csrc/include/pmpp/utils/common.cuh b/csrc/include/pmpp/utils/common.cuh
@@ -4,62 +4,52 @@
 #include <cuda_runtime_api.h>
 #include <stdexcept>
 
-#ifdef PMPP_CUDA_ERR_CHECK
-    #error "PMPP_CUDA_ERR_CHECK already defined."
+/**
+ * @brief Check the given cuda error. Exit with `EXIT_FAILURE` if not
+ *        success.
+ *        The error message is printed to `stderr`; `err` is evaluated once.
+ */
+#define PMPP_CUDA_ERR_CHECK(err)                                              \
+    do {                                                                      \
+        cudaError_t err_ = (err);                                             \
+        if (err_ != cudaSuccess) {                                            \
+            ::fprintf(                                                        \
+                stderr, "CUDA error at %s:%d; Error code: %d(%s) \"%s\"",     \
+                __FILE__, __LINE__, err_, ::cudaGetErrorString(err_), #err);  \
+            ::cudaDeviceReset();                                              \
+            ::std::exit(EXIT_FAILURE);                                        \
+        }                                                                     \
+    } while (0)
+
+#define PMPP_CUDA_ABORT(msg)                                                  \
+    do {                                                                      \
+        ::fprintf(stderr, "Abort at %s:%d \"%s\"", __FILE__, __LINE__, msg);  \
+        ::cudaDeviceReset();                                                  \
+        ::std::abort();                                                       \
+    } while (0)
+
+#ifdef NDEBUG
+    /**
+     * @brief With `NDEBUG`, the check is a no-op and `err` is not evaluated.
+     */
+    #define PMPP_DEBUG_CUDA_ERR_CHECK(err) ((void) 0)
 #else
     /**
      * @brief Check the given cuda error. Exit with `EXIT_FAILURE` if not
      *        success.
      *        The error message is printed to `stderr`.
      */
-    #define PMPP_CUDA_ERR_CHECK(err)                                          \
-        do {                                                                  \
-            cudaError_t err_ = (err);                                         \
-            if (err_ != cudaSuccess) {                                        \
-                ::fprintf(stderr,                                             \
-                          "CUDA error at %s:%d; Error code: %d(%s) \"%s\"",   \
-                          __FILE__, __LINE__, err,                            \
-                          ::cudaGetErrorString(err_), #err);                  \
-                ::cudaDeviceReset();                                          \
-                ::std::abort();                                               \
-            }                                                                 \
-        } while (0)
-
-    #define PMPP_ABORT(msg)                                                   \
-        do {                                                                  \
-            ::fprintf(stderr, "Abort at %s:%d \"%s\"", __FILE__, __LINE__,    \
-                      msg);                                                   \
-            ::cudaDeviceReset();                                              \
-            ::std::abort();                                                   \
-        } while (0)
-#endif
-
-#ifdef PMPP_DEBUG_CUDA_ERR_CHECK
-    #error "PMPP_DEBUG_CUDA_ERR_CHECK already defined."
-#else
-    #ifdef NDEBUG
-        /**
-         * @brief Cuda error check is turned off on Release mode.
-         */
-        #define PMPP_DEBUG_CUDA_ERR_CHECK(err) ((void) 0)
-    #else
-        /**
-         * @brief Check the given cuda error. Exit with `EXIT_FAILURE` if not
-         *        success.
-         *        The error message is printed to `stderr`.
-         */
-        #define PMPP_DEBUG_CUDA_ERR_CHECK(err) PMPP_CUDA_ERR_CHECK(err)
-    #endif
+    #define PMPP_DEBUG_CUDA_ERR_CHECK(err) PMPP_CUDA_ERR_CHECK(err)
 #endif
 
 namespace pmpp::cuda
 {
+
 template <typename T>
 __host__ __device__ void initMemory(T* ptr, size_t n, const T& val)
 {
     for (size_t i = 0; i < n; ++i) {
         ptr[i] = val;
     }
 }
-
 }  // namespace pmpp::cuda
diff --git a/csrc/include/pmpp/utils/math.hpp b/csrc/include/pmpp/utils/math.hpp
@@ -14,8 +14,8 @@ namespace pmpp
  * @return The ceiling of the division of `a` by `b`.
  */
 template <typename T1, typename T2>
-    requires std::is_integral_v<T1> && std::is_integral_v<T2>
-[[nodiscard]] constexpr auto ceilDiv(T1 a, T2 b) -> T1
+[[nodiscard]] 
+constexpr auto ceilDiv(T1 a, T2 b) -> T1
 {
     return T1((a + b - 1) / b);
 }
diff --git a/csrc/lib/ops/vecAdd/op.cuh b/csrc/lib/ops/vecAdd/op.cuh
@@ -9,7 +9,7 @@ namespace pmpp::ops::cuda
 __global__ void vecAddKernelv0(const fp32_t* a, const fp32_t* b, fp32_t* c,
                                int32_t n)
 {
-
+    // Coalesced DRAM access
     int gtid = threadIdx.x + blockDim.x * blockIdx.x;
     if (gtid < n) {
         // [DRAM] 2 load, 1 store, 3 inst
@@ -20,9 +20,10 @@ __global__ void vecAddKernelv0(const fp32_t* a, const fp32_t* b, fp32_t* c,
 __global__ void vecAddKernelv1(const fp32_t* a, const fp32_t* b, fp32_t* c,
                                int32_t n)
 {
-
+    // Coalesced DRAM access
     int gtid = threadIdx.x + blockDim.x * blockIdx.x;
     gtid = gtid % 2 == 0 ? gtid + 1 : gtid - 1;
+    gtid = gtid == n ? n - 1 : gtid;
     if (gtid < n) {
         // [DRAM] 2 load, 1 store, 3 inst
         c[gtid] = a[gtid] + b[gtid];
@@ -32,8 +33,10 @@ __global__ void vecAddKernelv1(const fp32_t* a, const fp32_t* b, fp32_t* c,
 __global__ void vecAddKernelv2(const fp32_t* a, const fp32_t* b, fp32_t* c,
                                int32_t n)
 {
-
-    int gtid = threadIdx.x + blockDim.x * blockIdx.x + 1;
+    int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+    if (gtid % warpSize == 0) {
+        gtid = (gtid + warpSize) % (ceilDiv(n, warpSize) * warpSize);
+    }
     if (gtid < n) {
         // [DRAM] 2 load, 1 store, 3 inst
         c[gtid] = a[gtid] + b[gtid];
@@ -53,9 +56,9 @@ void launchVecAdd(const fp32_t* d_A, const fp32_t* d_B, fp32_t* d_C, size_t n)
     } else if (VERSION == 2) {
         vecAddKernelv2<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
     } else {
-        PMPP_ABORT(std::format("Unsupported version: {}", VERSION).c_str());
+        PMPP_CUDA_ABORT(
+            std::format("Unsupported version: {}", VERSION).c_str());
     }
-
     PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
 }
 
diff --git a/csrc/test/OpTest/vecAdd.cpp b/csrc/test/OpTest/vecAdd.cpp
@@ -103,9 +103,8 @@ TEST_F(OpTest, VecAddv2)
         std::cout << std::format("nElems: {}, cosSim: {}\n", nElems,
                                  cosSim.item<fp32_t>());
 
-        // // [NOTE] This won't pass because the kernel is deliberately wrong
-        // EXPECT_TRUE(matCh.allclose(matCd2h));
-        // EXPECT_GE(cosSim.item<fp32_t>(), 0.99);
+        EXPECT_TRUE(matCh.allclose(matCd2h));
+        EXPECT_GE(cosSim.item<fp32_t>(), 0.99);
     }
 }
 }  // namespace pmpp::test::ops
diff --git a/csrc/vcpkg.json b/csrc/vcpkg.json
@@ -1,7 +1,6 @@
 {
     "dependencies": [
         "cxxopts",
-        "fmt",
         "spdlog",
         "proxy",
         "gtest",
diff --git a/pmpp/models/attention.py b/pmpp/models/attention.py

Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@`
`3`	`3`	`#include <iterator>`
`4`	`4`	`#include <ranges>`
`5`	`5`	`#include <type_traits>`
	`6`	`+#include <concepts>`
`6`	`7`
`7`	`8`	`namespace pmpp`
`8`	`9`	`{`
Original file line number	Diff line number	Diff line change
`@@ -14,8 +14,8 @@ namespace pmpp`
`14`	`14`	* @return The ceiling of the division of `a` by `b`.
`15`	`15`	`*/`
`16`	`16`	`template <typename T1, typename T2>`
`17`		`- requires std::is_integral_v<T1> && std::is_integral_v<T2>`
`18`		`-[[nodiscard]] constexpr auto ceilDiv(T1 a, T2 b) -> T1`
	`17`	`+[[nodiscard]]`
	`18`	`+constexpr auto ceilDiv(T1 a, T2 b) -> T1`
`19`	`19`	`{`
`20`	`20`	`return T1((a + b - 1) / b);`
`21`	`21`	`}`
Original file line number	Diff line number	Diff line change
`@@ -103,9 +103,8 @@ TEST_F(OpTest, VecAddv2)`
`103`	`103`	`std::cout << std::format("nElems: {}, cosSim: {}\n", nElems,`
`104`	`104`	`cosSim.item<fp32_t>());`
`105`	`105`
`106`		`- // // [NOTE] This won't pass because the kernel is deliberately wrong`
`107`		`- // EXPECT_TRUE(matCh.allclose(matCd2h));`
`108`		`- // EXPECT_GE(cosSim.item<fp32_t>(), 0.99);`
	`106`	`+ EXPECT_TRUE(matCh.allclose(matCd2h));`
	`107`	`+ EXPECT_GE(cosSim.item<fp32_t>(), 0.99);`
`109`	`108`	`}`
`110`	`109`	`}`
`111`	`110`	`} // namespace pmpp::test::ops`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"dependencies": [`
`3`	`3`	`"cxxopts",`
`4`		`- "fmt",`
`5`	`4`	`"spdlog",`
`6`	`5`	`"proxy",`
`7`	`6`	`"gtest",`