Compute-Power Cultivation Notes: LeetGPU Solutions

Challenges: https://leetgpu.com/challenges · Profile: https://leetgpu.com/EpicSamurai464

Per-problem CUDA solutions, optimization notes, and some complete implementations.

CUDA Best Practices

Index

Easy                     Medium                         Hard
vector-addition          reduction                      gpt-2-transformer-block
matrix-multiplication    softmax
matrix-transpose         dot-product
color-inversion          parallel-merge
1d-convolution           int8-quantized-matmul
reverse-array            batch-normalization
relu                     rotary-positional-embedding
leaky-relu               rms-normalization
rainbow-table            weight-dequantization
matrix-copy
count-array-element
count-2d-array-element

Easy

vector-addition: basic implementation

#include <cuda_runtime.h>

__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N)
        C[idx] = A[idx] + B[idx];
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}

0.09953 ms 11.4th percentile (B200)

The two numbers after each solution are the runtime and the percentile rank; less time means a higher percentile.

Vectorization

#include <cuda_runtime.h>

__global__ void vector_add4(const float4* A, const float4* B, float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N / 4) {
        reinterpret_cast<float4*>(C)[idx] = make_float4(A[idx].x + B[idx].x, A[idx].y + B[idx].y,
                                                        A[idx].z + B[idx].z, A[idx].w + B[idx].w);
    } else if (idx == N / 4) {
        // one extra thread handles the N % 4 leftover elements
        for (int i = N - N % 4; i < N; i++)
            C[i] = reinterpret_cast<const float*>(A)[i] + reinterpret_cast<const float*>(B)[i];
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = ((N+3) / 4 + threadsPerBlock - 1) / threadsPerBlock;
    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    vector_add4<<<blocksPerGrid, threadsPerBlock>>>(A4, B4, C, N);
    cudaDeviceSynchronize();
}

0.06307 ms 61.4th percentile (B200)

This exploits the GPU's vectorized memory access: a float4 is 4 × 32 = 128 bits and moves through vector registers, which speeds up memory traffic substantially. The cost is handling the tail of fewer than 4 elements separately.

Compiler-assisted speedup

#include <cuda_runtime.h>
#include <cstdint>

__global__ void vecadd4_kernel(const float4* __restrict__ A4,
                               const float4* __restrict__ B4,
                               float4* __restrict__ C4,
                               int N4) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N4) {
        float4 a = A4[idx];
        float4 b = B4[idx];
        C4[idx] = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
    }
}

__global__ void tail_kernel(const float* __restrict__ A,
                            const float* __restrict__ B,
                            float* __restrict__ C,
                            int start, int N) {
    int i = start + blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int N4   = N >> 2;        // N / 4
    int tail = N & 3;         // N % 4

    if (N4 > 0) {
        const float4* A4 = reinterpret_cast<const float4*>(A);
        const float4* B4 = reinterpret_cast<const float4*>(B);
        float4*       C4 = reinterpret_cast<float4*>(C);

        int threads = 256;
        int blocks  = (N4 + threads - 1) / threads;
        vecadd4_kernel<<<blocks, threads>>>(A4, B4, C4, N4);
    }

    if (tail) {
        int start   = N & ~3; // first tail element (N rounded down to a multiple of 4)
        // one block of 32 threads is plenty for a tail of <= 3 elements
        tail_kernel<<<1, 32>>>(A, B, C, start, N);
    }
}

0.05337 ms 80.0th percentile (B200)

Adding the __restrict__ qualifier helps the compiler optimize, and splitting the tail into its own kernel keeps it from serializing inside the main kernel.

matrix-multiplication: basic version

#include <cuda_runtime.h>

__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if(row < M && col < K){
        float sum = 0.0f;
        #pragma unroll
        for(int r = 0; r < N; r++){
            sum += A[row * N + r] * B[r * K + col];
        }
        C[row * K + col] = sum;
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);
    cudaDeviceSynchronize();
}

71.06379 ms 27.0th percentile (B200)

Fun fact: GEMM is the most heavily used operator in AI.
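The repo stops at the naive kernel for this problem. As a hedged next step (my sketch, not from the repo), the standard shared-memory tiling: each 16 x 16 slab of A and B is staged once and reused 16 times, cutting global reads by the tile width.

#define TILE 16

__global__ void matmul_tiled_kernel(const float* A, const float* B, float* C,
                                    int M, int N, int K) {
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float sum = 0.0f;
    for (int t = 0; t < N; t += TILE) {
        // stage one TILE x TILE slab of A and B; zero-pad out-of-range entries
        As[threadIdx.y][threadIdx.x] =
            (row < M && t + threadIdx.x < N) ? A[row * N + t + threadIdx.x] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] =
            (t + threadIdx.y < N && col < K) ? B[(t + threadIdx.y) * K + col] : 0.0f;
        __syncthreads();
        #pragma unroll
        for (int i = 0; i < TILE; i++)
            sum += As[threadIdx.y][i] * Bs[i][threadIdx.x];
        __syncthreads();
    }
    if (row < M && col < K)
        C[row * K + col] = sum;
}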

matrix-transpose: basic implementation

#include <cuda_runtime.h>
#define BLOCK_SIZE 32
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if(r < rows && c < cols)
        output[c * rows + r] = input[r * cols + c];
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocksPerGrid((cols + BLOCK_SIZE - 1) / BLOCK_SIZE,
                       (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}

0.27386 ms 18.9th percentile (B200)

By the book: swap row and column indices when assigning input to output.

Shared memory, basic

#define BLOCK_SIZE 16
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE];
    if(r < rows && c < cols){
        tile[threadIdx.x][threadIdx.y] = input[r * cols + c];
        output[c * rows + r] = tile[threadIdx.x][threadIdx.y];
    }
}

0.18147 ms 35.9th percentile (B200)

Merely routing the accesses through shared memory already gives a big jump. The input reads are contiguous within a warp (along the threadIdx.x direction, i.e. c), so they coalesce when staged through shared memory.

Shared memory, advanced

#define BLOCK_SIZE 16
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    int transR = blockIdx.x * blockDim.x + threadIdx.y;
    int transC = blockIdx.y * blockDim.y + threadIdx.x;
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE];
    if(r < rows && c < cols){
        tile[threadIdx.y][threadIdx.x] = input[r * cols + c];
    }
    __syncthreads();
    if(transR < cols && transC < rows){
        output[transR * rows + transC] = tile[threadIdx.x][threadIdx.y];
    }
}

0.178 ms 37.7th percentile (B200)

Transposing the coordinates inside shared memory makes the output accesses contiguous as well. But threads now read each other's data, so a barrier is required.

Why can shared memory tolerate strided access when the input/output arrays cannot? Because shared memory is roughly as fast as L1 cache, a few dozen cycles per access, while input/output live in global memory at hundreds of cycles per access; leaving those accesses uncoalesced is extremely inefficient.

Final version

// solution.cu
#include <cuda_runtime.h>
#define TILE_DIM    32
#define BLOCK_ROWS   2

__global__ void transpose_cp_async_kernel(
    const float* __restrict__ input,
          float* __restrict__ output,
    int width, int height)  // width=cols, height=rows
{
    // pad one extra column so each row straddles the banks: no bank conflicts
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    // plain copy into the tile (despite the kernel's name, no cp.async here)
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        int y = yIndex + i;
        if (y < height && xIndex < width)
            tile[threadIdx.y + i][threadIdx.x] =
                input[y * width + xIndex];
    }

    __syncthreads();

    // write back to global memory, note the transposed coordinates
    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        int y = yIndex + i;
        if (y < width && xIndex < height) {
            output[y * height + xIndex] =
                tile[threadIdx.x][threadIdx.y + i];
        }
    }
}

extern "C" void solve(
    const float* input, float* output,
    int rows, int cols)
{
    dim3 threads(TILE_DIM, BLOCK_ROWS);
    dim3 grid((cols + TILE_DIM - 1) / TILE_DIM,
              (rows + TILE_DIM - 1) / TILE_DIM);

    transpose_cp_async_kernel
        <<<grid, threads>>>(input, output, cols, rows);
    //cudaDeviceSynchronize();
}

0.06781 ms 98.1th percentile (B200)


The main optimization points:

  • Pad the shared-memory tile by one column so each row physically straddles the banks, eliminating bank conflicts
  • TILE_DIM of 32 matches the warp size exactly; BLOCK_ROWS of 2 buys instruction-level parallelism
  • The __restrict__ qualifiers promise non-overlapping memory, enabling compiler optimization
  • The explicit device synchronization is dropped

Further reading: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/

color-inversion: basic version

#include <cuda_runtime.h>

__global__ void invert_kernel(unsigned char* image, int width, int height) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx >= width * height) return;
    image[idx * 4]      = 255 - image[idx * 4];
    image[idx * 4 + 1]  = 255 - image[idx * 4 + 1];
    image[idx * 4 + 2]  = 255 - image[idx * 4 + 2];
}
// image is a device pointer (i.e. a pointer to memory on the GPU)
extern "C" void solve(unsigned char* image, int width, int height) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (width * height + threadsPerBlock - 1) / threadsPerBlock;
    invert_kernel<<<blocksPerGrid, threadsPerBlock>>>(image, width, height);
    //cudaDeviceSynchronize();
}

0.06884 ms 21.2th percentile (B200)

Each thread grabs one 32-bit pixel and inverts just the low 24 RGB bits.

Bitwise

__global__ void invert_kernel(unsigned char* image, int width, int height) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx >= width * height) return;
    auto flip24 = [](int32_t x) -> int32_t {
        // XOR flips the low 24 bits; the high 8 (alpha) stay put
        return x ^ 0xFFFFFF;
    };
    int temp = flip24(reinterpret_cast<int32_t*>(image)[idx]);
    reinterpret_cast<int32_t*>(image)[idx] = temp;
}

0.06728 ms 40.2th percentile (B200)

A single bitwise flip inverts the whole 32-bit pixel's RGB in one go, replacing the per-channel writes.

Vectorization

#define BLOCK_SIZE 256
__global__ void invert_kernel(unsigned char* image, int width, int height) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx > width * height / 4) return;
    auto flip24 = [](int32_t x) -> int32_t {
        // XOR flips the low 24 bits; the high 8 (alpha) stay put
        return x ^ 0xFFFFFF;
    };
    if(idx == width * height / 4){
        if(width * height % 4 == 0)
            return;
        #pragma unroll
        for(int i = idx * 4; i < width * height; i++){
            int temp = flip24(reinterpret_cast<int32_t*>(image)[i]);
            reinterpret_cast<int32_t*>(image)[i] = temp;
        }
        return;
    }
    auto temp = reinterpret_cast<int4*>(image)[idx];
    //__syncthreads();
    reinterpret_cast<int4*>(image)[idx] = make_int4(flip24(temp.x), flip24(temp.y), 
    flip24(temp.z), flip24(temp.w));
}

0.03529 ms 78.4th percentile (B200)

The classic vectorization trick for faster memory access.

Pipelining

#include <cuda_runtime.h>
#include <stdint.h>
#include <algorithm>

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 128
#endif

// invert the low 24 bits (RGB); alpha untouched
__device__ __forceinline__ uint32_t flip24(uint32_t x) { return x ^ 0x00FFFFFFu; }

// 128-bit streaming load (L1 no-allocate + L2 128B); otherwise fall back to a plain load
__device__ __forceinline__ uint4 ld_stream128(const uint4* ptr) {
#if __CUDA_ARCH__ >= 900  // Hopper/Blackwell
    uint4 v;
    asm volatile(
        "ld.global.nc.L1::no_allocate.L2::128B.v4.u32 {%0,%1,%2,%3}, [%4];\n"
        : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
        : "l"(ptr));
    return v;
#else
    return *ptr;
#endif
}

// 128-bit streaming store
__device__ __forceinline__ void st_stream128(uint4* ptr, const uint4& v) {
#if __CUDA_ARCH__ >= 900
    asm volatile(
        "st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};\n" ::
        "l"(ptr), "r"(v.x), "r"(v.y), "r"(v.z), "r"(v.w));
#else
    *ptr = v;
#endif
}

__global__ void invert_rgba_prefetch(uint32_t* __restrict__ data, size_t num_pixels)
{
    // work in groups of 4 pixels (16 B)
    size_t n4 = num_pixels >> 2;
    uint4* __restrict__ p4 = reinterpret_cast<uint4*>(data);

    size_t tid    = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    // software pipeline: prefetch one group up front
    if (tid < n4) {
        size_t j  = tid;
        uint4 r0  = ld_stream128(p4 + j);
        j += stride;

        // main loop: prefetch the next group r1 while processing and storing r0
        for (; j < n4; j += stride) {
            uint4 r1 = ld_stream128(p4 + j);

            r0.x = flip24(r0.x);
            r0.y = flip24(r0.y);
            r0.z = flip24(r0.z);
            r0.w = flip24(r0.w);
            st_stream128(p4 + (j - stride), r0);

            r0 = r1; // slide the window
        }

        // epilogue: store the final group r0
        r0.x = flip24(r0.x);
        r0.y = flip24(r0.y);
        r0.z = flip24(r0.z);
        r0.w = flip24(r0.w);
        st_stream128(p4 + (j - stride), r0);
    }

    // handle the remainder (< 4 pixels) with a single thread to avoid races
    if ((num_pixels & 3) && tid == 0) {
        for (size_t k = (n4 << 2); k < num_pixels; ++k) {
            data[k] = flip24(data[k]);
        }
    }
}

extern "C" void solve(unsigned char* image, int width, int height) {
    const size_t N = (size_t)width * (size_t)height;
    uint32_t* data = reinterpret_cast<uint32_t*>(image);

    int sm = 148; // hard-coded B200 SM count; normally queried via cudaDeviceGetAttribute
    const int threads = BLOCK_SIZE;
    size_t n4 = N >> 2;

    int blocks;
    if (n4 == 0) blocks = 1;
    else {
        // mild oversubscription, capped by actual demand (here 22x the SM count)
        int target = sm * 22;
        size_t need = (n4 + threads - 1) / threads;
        blocks = std::max(1, (int)std::min((size_t)target, need));
    }

    invert_rgba_prefetch<<<blocks, threads>>>(data, N);
}

0.03195 ms 99.0th percentile (B200)

This one leans on a few advanced features:

  • Pipelining: each round prefetches the next input, increasing throughput
  • PTX: pin down the exact memory instructions used
  • Capped block count: 148 is the B200's SM count; the cap prevents bandwidth thrashing

1d-convolution: basic version

#include <cuda_runtime.h>
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= input_size - kernel_size + 1) return;
    float temp = 0.0f;
    #pragma unroll
    for(int i = 0; i < kernel_size; i++){
        float tempI = input[idx + i];
        float tempK = kernel[i];
        temp += tempI * tempK;
    }
    output[idx] = temp;
}

// input, kernel, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size, int kernel_size) {
    int output_size = input_size - kernel_size + 1;
    int threadsPerBlock = 256;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;

    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, kernel, output, input_size, kernel_size);
    //cudaDeviceSynchronize();
}

0.82399 ms 62.5th percentile (B200)

1D convolution: hard to grasp, easy to write.

Vectorization

#include <cuda_runtime.h>
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= input_size - kernel_size + 1) return;
    float temp = 0.0f;
    #pragma unroll
    for(int i = 0; i < kernel_size / 4; i++){
        int p = i * 4;
        float4 tempI = make_float4(input[idx + p], input[idx + p + 1], input[idx + p + 2], input[idx + p + 3]);
        float4 tempK = make_float4(kernel[p], kernel[p + 1], kernel[p + 2], kernel[p + 3]);
        temp += tempI.x * tempK.x + tempI.y * tempK.y + tempI.z * tempK.z + tempI.w * tempK.w ;
    }
    for(int i = kernel_size & ~3; i < kernel_size; i++ ){
        float tempI = input[idx + i];
        float tempK = kernel[i];
        temp += tempI * tempK;
    }
    output[idx] = temp;
}

// input, kernel, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size, int kernel_size) {
    int output_size = input_size - kernel_size + 1;
    int threadsPerBlock = 1024;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;

    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, kernel, output, input_size, kernel_size);
    //cudaDeviceSynchronize();
}

0.73152 ms 80.0th percentile (B200)

What's distinctive here: the vectorization happens inside each thread's loop, rather than by shrinking the thread count.

Shared memory

#include <cuda_runtime.h>

__global__ void conv1d_shared_kernel(
    const float* __restrict__ input,
    const float* __restrict__ kernel,
    float* __restrict__ output,
    int input_size, int kernel_size)
{
    const int out_size = input_size - kernel_size + 1;
    const int out_start = blockIdx.x * blockDim.x;                  // first output this block owns (global)
    const int out_end   = min(out_start + blockDim.x, out_size);    // one past the last output this block owns

    // shared memory layout: [ input slice (blockDim.x + K - 1) | kernel (K) ]
    extern __shared__ float smem[];
    float* s_in = smem;
    float* s_k  = smem + (blockDim.x + kernel_size - 1);

    // input start for this tile
    const int tile_in_start = out_start; // for valid convolution the input aligns with the output
    const int tile_in_len   = min(blockDim.x + kernel_size - 1, input_size - tile_in_start);

    // 1) cooperative load: input slice -> shared
    #pragma unroll
    for (int t = threadIdx.x; t < tile_in_len; t += blockDim.x) {
        s_in[t] = input[tile_in_start + t];
    }
    // 2) cooperative load: kernel -> shared
    #pragma unroll
    for (int t = threadIdx.x; t < kernel_size; t += blockDim.x) {
        s_k[t] = kernel[t];
    }
    __syncthreads();

    // 3) compute this thread's output (if within range)
    const int out_idx = out_start + threadIdx.x;
    if (out_idx < out_end) {
        // dot product out of shared memory: s_in[threadIdx.x + j] * s_k[j]
        float acc = 0.f;

        // manual 4-way unroll (sidesteps float4 alignment and works from any offset)
        int j = 0;
        #pragma unroll
        for (; j + 3 < kernel_size; j += 4) {
            float a0 = s_in[threadIdx.x + j + 0];
            float a1 = s_in[threadIdx.x + j + 1];
            float a2 = s_in[threadIdx.x + j + 2];
            float a3 = s_in[threadIdx.x + j + 3];
            float b0 = s_k[j + 0];
            float b1 = s_k[j + 1];
            float b2 = s_k[j + 2];
            float b3 = s_k[j + 3];
            acc += a0*b0 + a1*b1 + a2*b2 + a3*b3;
        }
        for (; j < kernel_size; ++j) {
            acc += s_in[threadIdx.x + j] * s_k[j];
        }

        output[out_idx] = acc;
    }
}

// input, kernel, output are device pointers
extern "C" void solve(const float* input, const float* kernel, float* output,
                      int input_size, int kernel_size)
{
    const int out_size = input_size - kernel_size + 1;
    if (out_size <= 0) return;

    const int threadsPerBlock = 1024;
    const int blocksPerGrid   = (out_size + threadsPerBlock - 1) / threadsPerBlock;

    // shared memory size = input slice + kernel
    const size_t shmem_bytes =
        (threadsPerBlock + kernel_size - 1 + kernel_size) * sizeof(float);

    // the kernel has no grid-stride loop, so every output block must actually launch
    conv1d_shared_kernel<<<blocksPerGrid, threadsPerBlock, shmem_bytes>>>(
        input, kernel, output, input_size, kernel_size);

}

0.68818 ms 88.4th percentile (B200)

Loading the block's input slice plus the whole kernel into shared memory cuts down on global memory accesses.

Multi-level memory access

#include <cuda_runtime.h>
#include <stdint.h>

#define BLOCK_THREADS 256
#define THREAD_OUT    8
#define MAX_K         4096

__constant__ float ck[MAX_K];

template<int BYTES>
__device__ __forceinline__ void cp_async_smem_gmem(void* smem_dst, const void* gmem_src) {
    uint32_t saddr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_dst)); // 32-bit shared addr
    asm volatile(
        "cp.async.cg.shared.global [%0], [%1], %2;\n" ::
        "r"(saddr), "l"(gmem_src), "n"(BYTES)
    );
}

__global__ void conv1d_bk_cpasync_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    int input_size, int kernel_size)
{
    const int out_size = input_size - kernel_size + 1;
    if (out_size <= 0) return;

    const int out_per_blk = BLOCK_THREADS * THREAD_OUT;
    const int out_blk_beg = blockIdx.x * out_per_blk;
    if (out_blk_beg >= out_size) return;

    // the tile's input covers [out_blk_beg, out_blk_beg + out_per_blk + K - 1)
    const int tile_in_start = out_blk_beg;
    const int stage_elems   = out_per_blk + kernel_size - 1;
    const int tile_in_len   = max(0, min(stage_elems, input_size - tile_in_start));

    extern __shared__ float smem[];
    float* s_in = smem;

    // ===== full-coverage load: cooperative float4 + scalar tail =====
    const int vec_elems  = (tile_in_len & ~3);     // 4-aligned portion
    const int vec_chunks = (vec_elems >> 2);       // number of float4 chunks

    // key fix: every thread strides over every chunk (no more ragged per-warp segments)
    for (int i = threadIdx.x; i < vec_chunks; i += BLOCK_THREADS) {
        const char* gptr = reinterpret_cast<const char*>(input + tile_in_start) + i * 16;
        char*       sptr = reinterpret_cast<char*>(s_in)                               + i * 16;
        cp_async_smem_gmem<16>(sptr, gptr);
    }
    asm volatile("cp.async.commit_group;\n"::);
    asm volatile("cp.async.wait_group 0;\n"::);

    // scalar cleanup for the < 4 leftover elements
    for (int t = vec_elems + threadIdx.x; t < tile_in_len; t += BLOCK_THREADS) {
        s_in[t] = input[tile_in_start + t];
    }
    __syncthreads();

    // ===== compute: 8 outputs per thread =====
    const int tlocal  = threadIdx.x * THREAD_OUT;
    const int out_g0  = out_blk_beg + tlocal;
    const int n_valid = max(0, min(THREAD_OUT, out_size - out_g0));
    if (n_valid <= 0) return;

    float acc[THREAD_OUT] = {0};

    int j = 0;
    for (; j + 3 < kernel_size; j += 4) {
        float k0 = ck[j+0], k1 = ck[j+1], k2 = ck[j+2], k3 = ck[j+3];
        #pragma unroll
        for (int o = 0; o < THREAD_OUT; ++o) {
            float a0 = s_in[tlocal + o + j + 0];
            float a1 = s_in[tlocal + o + j + 1];
            float a2 = s_in[tlocal + o + j + 2];
            float a3 = s_in[tlocal + o + j + 3];
            acc[o] += a0*k0 + a1*k1 + a2*k2 + a3*k3;  // let the compiler schedule FMA/ILP freely
        }
    }
    for (; j < kernel_size; ++j) {
        float kj = ck[j];
        #pragma unroll
        for (int o = 0; o < THREAD_OUT; ++o) {
            acc[o] += s_in[tlocal + o + j] * kj;
        }
    }

    #pragma unroll
    for (int o = 0; o < THREAD_OUT; ++o) {
        if (o < n_valid) output[out_g0 + o] = acc[o];
    }
}

extern "C" void solve(const float* __restrict__ input,
                      const float* __restrict__ kernel,
                      float* __restrict__ output,
                      int input_size, int kernel_size)
{
    const int out_size = input_size - kernel_size + 1;
    if (out_size <= 0) return;

    // stash the kernel in constant memory (device-to-device copy); assumes kernel_size <= MAX_K
    cudaMemcpyToSymbol(ck, kernel, kernel_size * sizeof(float), 0, cudaMemcpyDeviceToDevice);

    const int out_per_blk = BLOCK_THREADS * THREAD_OUT;
    dim3 block(BLOCK_THREADS);
    dim3 grid((out_size + out_per_blk - 1) / out_per_blk);

    // one tile's worth of shared memory (no pipelining; simple and stable)
    size_t shmem_bytes = (size_t)(out_per_blk + kernel_size - 1) * sizeof(float);

    conv1d_bk_cpasync_kernel<<<grid, block, shmem_bytes>>>(input, output, input_size, kernel_size);
}

0.27392 ms 94.4th percentile (B200)

  • Constant memory: on a constant-cache hit with broadcast, latency is on the order of ~10–20 cycles at the SM level, and one fetch feeds an entire warp, so the per-thread cost is nearly negligible.
  • The .cg in cp.async.cg.shared.global means "cache global": the copy goes through L2 only, bypassing L1. Good for bulk transfers (avoids polluting L1).
  • Each input element gets reused ~K times on average. Staged in shared memory, those repeat accesses become SMEM hits (single-cycle, very high bandwidth), slashing DRAM/L2 read traffic.

reverse-array: basic version

#include <cuda_runtime.h>

__global__ void reverse_array(float* input, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // strictly idx < N/2: letting idx == N/2 through swaps the middle pair twice (a race)
    if(idx >= N / 2) return;
    int other_idx = N - 1 - idx;
    float ta = input[idx];
    float tb = input[other_idx];
    input[idx] = tb;
    input[other_idx] = ta;
}

0.1115 ms 23.1th percentile (B200)

The classic two-pointer array reversal.
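The launcher isn't shown for this version; a minimal one (assumed, mirroring the shared-memory variant below, but launching only the N/2 swapping threads):

extern "C" void solve(float* input, int N) {
    if (N < 2) return;                      // nothing to swap
    int threadsPerBlock = 256;
    int blocksPerGrid = (N / 2 + threadsPerBlock - 1) / threadsPerBlock;
    reverse_array<<<blocksPerGrid, threadsPerBlock>>>(input, N);
}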

Shared memory

#include <cuda_runtime.h>
#define BLOCK_DIM 256
__global__ void reverse_array(float* input, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= N) return;
    __shared__ float copy_of_input[BLOCK_DIM];
    copy_of_input[threadIdx.x] = input[idx];
    __syncthreads();
    input[N - 1 - idx] = copy_of_input[threadIdx.x];
}

// input is device pointer
extern "C" void solve(float* input, int N) {
    int threadsPerBlock = BLOCK_DIM;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    reverse_array<<<blocksPerGrid, threadsPerBlock>>>(input, N);
    //cudaDeviceSynchronize();
}

0.08007 ms 69.2th percentile (B200)

Stage through shared memory as a relay.

relu

__global__ void relu_kernel(const float* input, float* output, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < N){
        const float tmp = input[idx];
        output[idx] = tmp > 0 ? tmp : 0;
    }
}

0.09597 ms 15.4th percentile (B200)

One of the most common activation functions: it wipes out the negatives.

leaky-relu

__global__ void leaky_relu_kernel(const float* input, float* output, int N) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= N)    return;
    auto tmp = input[idx];
    output[idx] = tmp >= 0 ? tmp : tmp * 0.01f;  // float literal avoids a round-trip through double
}

0.18015 ms 6.7th percentile (B200)

ReLU, but merciful.

rainbow-table

__global__ void fnv1a_hash_kernel(const int* input, unsigned int* output, int N, int R) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= N) return;
    unsigned int temp = input[idx];   // hash R rounds in registers
    while(R--){
        temp = fnv1a_hash(temp);
    }
    output[idx] = temp;
}

0.05202 ms 12.5th percentile (B200)
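The kernel leans on a device-side fnv1a_hash supplied by the challenge. From memory (treat the exact constants as an assumption), the LeetGPU starter defines it roughly as:

__device__ unsigned int fnv1a_hash(int input) {
    const unsigned int FNV_PRIME = 16777619;
    const unsigned int OFFSET_BASIS = 2166136261;
    unsigned int hash = OFFSET_BASIS;
    // hash the four bytes of the int, low byte first
    for (int byte_pos = 0; byte_pos < 4; byte_pos++) {
        unsigned char byte = (input >> (byte_pos * 8)) & 0xFF;
        hash = (hash ^ byte) * FNV_PRIME;
    }
    return hash;
}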

extern "C" void solve(const float* A, float* B, const int N) {
    cudaMemcpy(B, A, N*N*sizeof(float), cudaMemcpyDeviceToDevice);
}

0.0381 ms 52.0th percentile (B200)
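For reference, a hand-rolled alternative (a sketch, not from the repo): a grid-stride float4 copy. In practice a device-to-device cudaMemcpy is already near peak bandwidth, so this mostly serves as a baseline for experiments.

__global__ void copy4_kernel(const float4* __restrict__ src, float4* __restrict__ dst, size_t n4) {
    size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = idx; i < n4; i += stride)
        dst[i] = src[i];   // 16-byte vector copies
}
// assumes N*N is a multiple of 4; otherwise add a scalar tail like the other solutions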

count-array-element

__global__ void count_equal_kernel(const int* input, int* output, int N, int K) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // no early return: every lane must reach the full-mask shuffles below
    int warp_num = (idx < N && input[idx] == K) ? 1 : 0;
    for(int stride = warpSize / 2; stride > 0; stride /= 2)
        warp_num += __shfl_down_sync(0xffffffff,warp_num,stride);
    if(threadIdx.x % warpSize == 0) atomicAdd(output,warp_num);
}

2.05445 ms 0.0th percentile (B200)

count-2d-array-element

__global__ void count_2d_equal_kernel(const int* input, int* output, int N, int M, int K) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    if(col == 0 && row < N){
        for(int i = 0; i < M; i++){
            bool equal = input[row * M + i] == K ;
            atomicAdd(output, equal);
        }
    }
}

34.01452 ms 0.0th percentile (B200)
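Both count kernels sit at the 0.0th percentile, so there's plenty of headroom. A hedged sketch (mine, untested): treat the 2D array as a flat N*M buffer and reuse the warp-reduction pattern from count-array-element, with a grid-stride loop so no lane exits before the shuffles.

__global__ void count_2d_flat_kernel(const int* input, int* output, int total, int K) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int local = 0;
    for (int i = idx; i < total; i += blockDim.x * gridDim.x)
        local += (input[i] == K);
    for (int stride = warpSize / 2; stride > 0; stride /= 2)
        local += __shfl_down_sync(0xffffffff, local, stride);
    if ((threadIdx.x & 31) == 0) atomicAdd(output, local);   // one atomic per warp
}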

Medium

reduction: hierarchical reduction

#include <cuda_runtime.h>
#define BLOCK_SIZE 1024
__global__ void reduction(const float* input, float* output, int N){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    //1. copy to shared
    //2. shared reduction
    //3. warp reduction
    //4. block sum
    __shared__ float sharedMem[BLOCK_SIZE];
    // copy into shared memory (zero-pad past the end); no early return allowed
    float v = (idx < N) ? input[idx] : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads(); 

    for(int stride = blockDim.x / 2; stride >= warpSize; stride /= 2){
        if(threadIdx.x < stride){
            sharedMem[threadIdx.x] += sharedMem[threadIdx.x + stride];
        }
        __syncthreads();    
    }

    if(threadIdx.x < warpSize){
        float sum = sharedMem[threadIdx.x];          
        unsigned mask = 0xffffffff;
        sum += __shfl_down_sync(mask, sum, 16);
        sum += __shfl_down_sync(mask, sum, 8);
        sum += __shfl_down_sync(mask, sum, 4);
        sum += __shfl_down_sync(mask, sum, 2);
        sum += __shfl_down_sync(mask, sum, 1);
        if(threadIdx.x == 0){
            atomicAdd(output, sum);
        }
    }
}
// input, output are device pointers
extern "C" void solve(const float* input, float* output, int N) {  
    reduction<<<(N + BLOCK_SIZE - 1) / BLOCK_SIZE ,BLOCK_SIZE>>>(input, output, N);
}

0.27248 ms 24.6th percentile (B200)

sweet spot

#include <cuda_runtime.h>

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 256
#endif
#define FULL_MASK 0xffffffffu

__inline__ __device__ float warp_reduce_sum(float v) {
    v += __shfl_down_sync(FULL_MASK, v, 16);
    v += __shfl_down_sync(FULL_MASK, v, 8);
    v += __shfl_down_sync(FULL_MASK, v, 4);
    v += __shfl_down_sync(FULL_MASK, v, 2);
    v += __shfl_down_sync(FULL_MASK, v, 1);
    return v;
}

template<int BS>
__global__ void reduce_limit_blocks(const float* __restrict__ x,
                                    float* __restrict__ out,
                                    int n) {
    __shared__ float smem[BS];
    const int tid   = threadIdx.x;
    const int bid   = blockIdx.x;
    const int gsize = gridDim.x * (BS * 2);   // 2x unrolling
    int i = bid * (BS * 2) + tid;

    // per-thread accumulation (two elements per thread)
    float sum = 0.f;
    if (i < n)              sum += x[i];
    if (i + BS < n)         sum += x[i + BS];

    // grid-stride rounds, also 2x unrolled
    for (i += gsize; i < n; i += gsize) {
        sum += x[i];
        if (i + BS < n) sum += x[i + BS];
    }

    smem[tid] = sum;
    __syncthreads();

    // intra-block reduction (stop at 32)
    #pragma unroll
    for (int s = BS / 2; s >= 32; s >>= 1) {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }

    // warp reduction
    if (tid < 32) {
        float v = smem[tid];
        v = warp_reduce_sum(v);
        if (tid == 0) atomicAdd(out, v);
    }
}

extern "C" void solve(const float* input, float* output, int N) {
    // device info
    int dev = 0, sm = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev);

    // natural grid size (two elements per thread)
    int natural = (N + (BLOCK_SIZE * 2) - 1) / (BLOCK_SIZE * 2);

    // key: cap the grid, but not too low; SM count * 6–10 is a stable range
    int grid = sm * 8;
    if (natural < grid) grid = natural;
    if (grid < 1) grid = 1;

    reduce_limit_blocks<BLOCK_SIZE><<<grid, BLOCK_SIZE>>>(input, output, N);
}

0.05636 ms 60.0th percentile (B200)

Capped block count + two elements per thread + carefully tuned parameters: right on the bandwidth sweet spot.

Double whirlwind

#include <cuda_runtime.h>
#include <stdint.h>

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 256   // try 128/192/320 in A/B tests
#endif

#define FULL_MASK 0xffffffffu

__inline__ __device__ float warp_reduce_sum(float v) {
    v += __shfl_down_sync(FULL_MASK, v, 16);
    v += __shfl_down_sync(FULL_MASK, v, 8);
    v += __shfl_down_sync(FULL_MASK, v, 4);
    v += __shfl_down_sync(FULL_MASK, v, 2);
    v += __shfl_down_sync(FULL_MASK, v, 1);
    return v;
}

template<int BS>
__global__ void reduce_fast(const float* __restrict__ x,
                            float* __restrict__ out,
                            int n) {
    const int tid      = threadIdx.x;
    const int lane     = tid & 31;
    const int warp_id  = tid >> 5;             // 0..(BS/32-1)
    const int warps    = BS / 32;

    float sum = 0.f;

    // -------- vectorized main loop: float4 + 2x unroll => 8 elements per thread per iteration --------
    const bool aligned16 = ((reinterpret_cast<uintptr_t>(x) & 15u) == 0);
    if (aligned16) {
        const int n4 = n >> 2;                 // counted in float4s
        const float4* __restrict__ v = reinterpret_cast<const float4*>(x);

        int j = blockIdx.x * BS + tid;         // one vector index per thread
        const int stride4 = gridDim.x * BS;

        // two groups per round: j and j+stride4 (manual 2x unroll)
        for (; j < n4; j += (stride4 << 1)) {
            float4 a = v[j];
            sum += a.x + a.y + a.z + a.w;

            int j2 = j + stride4;
            if (j2 < n4) {
                float4 b = v[j2];
                sum += b.x + b.y + b.z + b.w;
            }
        }

        // tail of 0–3 elements beyond the 4-aligned region, still grid-stride
        int base = n4 << 2;
        for (int i = base + blockIdx.x * BS + tid; i < n; i += gridDim.x * BS) {
            sum += x[i];
        }
    } else {
        // not 16B-aligned: fall back to the scalar 2x-unrolled path
        const int gsize = gridDim.x * (BS * 2);
        int i = blockIdx.x * (BS * 2) + tid;
        if (i < n)       sum += x[i];
        if (i + BS < n)  sum += x[i + BS];
        for (i += gsize; i < n; i += gsize) {
            sum += x[i];
            if (i + BS < n) sum += x[i + BS];
        }
    }

    // -------- pure warp reduction + a single sync --------
    sum = warp_reduce_sum(sum);

    __shared__ float smem[BS / 32];            // one slot per warp
    if (lane == 0) smem[warp_id] = sum;
    __syncthreads();                           // the only barrier

    // warp 0 finishes the reduction
    float block_sum = 0.f;
    if (warp_id == 0) {
        block_sum = (tid < warps) ? smem[lane] : 0.f;
        block_sum = warp_reduce_sum(block_sum);
        if (lane == 0) atomicAdd(out, block_sum);
    }
}

extern "C" void solve(const float* input, float* output, int N) {
    int dev = 0, sm = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev);

    // enlarge the grid a bit to saturate bandwidth; SM * 10–12 often beats * 8
    int natural = (N + (BLOCK_SIZE * 8) - 1) / (BLOCK_SIZE * 8); // estimate based on vectorized throughput
    int grid = sm * 12;
    if (natural > 0) grid = min(grid, natural);
    if (grid < 1) grid = 1;

    // if the caller doesn't zero the output, don't zero it here either; keeps the benchmark consistent
    reduce_fast<BLOCK_SIZE><<<grid, BLOCK_SIZE>>>(input, output, N);
}

0.04402 ms 98.7th percentile (B200)

Two-level warp reduction with a single barrier avoids hammering shared memory, plus vectorized loads.

softmax: Heaven-and-Earth Longevity

#include <cuda_runtime.h>
#define BLOCK_SIZE 256
#define UPPER_DIV(A, B) ((A + B - 1) / B)
__global__ void block_max_kernel(const float* input, float* output, int N){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    //same as reduction, but get max instead of sum
    __shared__ float sharedMem[BLOCK_SIZE];
    float v = (idx < N) ? input[idx] : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads(); 

    for(int stride = blockDim.x / 2; stride >= warpSize; stride /= 2){
        if(threadIdx.x < stride){
            sharedMem[threadIdx.x] = fmaxf(sharedMem[threadIdx.x], sharedMem[threadIdx.x + stride]);
        }
        __syncthreads();    
    }

    if(threadIdx.x < warpSize){
        float blockMax = sharedMem[threadIdx.x];          
        for(int stride = warpSize / 2; stride > 0; stride /= 2){
            blockMax = fmaxf(blockMax, __shfl_down_sync(0xffffffff, blockMax, stride));
        }
        if(threadIdx.x == 0){
            output[blockIdx.x] = blockMax;
        }
    }  
}
__global__ void softmax_kernel(const float* input, float* output, float* inputMax, double* expSum, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float sharedMem[BLOCK_SIZE];
    float v = (idx < N) ? __expf(input[idx] - inputMax[0]) : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads(); 

    for(int stride = blockDim.x / 2; stride >= warpSize; stride /= 2){
        if(threadIdx.x < stride){
            sharedMem[threadIdx.x] += sharedMem[threadIdx.x + stride];
        }
        __syncthreads();    
    }

    if(threadIdx.x < warpSize){
        float blockSum = sharedMem[threadIdx.x];          
        for(int stride = warpSize / 2; stride > 0; stride /= 2){
            blockSum += __shfl_down_sync(0xffffffff, blockSum, stride);
        }
        if(threadIdx.x == 0){
            atomicAdd(expSum, blockSum);
        }
    }
}
__global__ void softmax_tail_kernel(const float* input, float* output, float* inputMax, double* expSum, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= N ) return;
    auto sum = *expSum;
    output[idx] = __expf(input[idx] - inputMax[0]) / sum;
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int N) {
    const int blocksPerGrid = UPPER_DIV(N, BLOCK_SIZE);
    double *expSum;
    float *blockMax;
    cudaStream_t asyncStream;
    cudaStreamCreate(&asyncStream);
    cudaMallocAsync(&expSum, sizeof(double), asyncStream);
    cudaMallocAsync(&blockMax, sizeof(float) * blocksPerGrid, asyncStream);
    block_max_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, asyncStream>>>(input, blockMax, N); 
    int blocks = UPPER_DIV(blocksPerGrid, BLOCK_SIZE);
    for(; blocks > 1 ; blocks = UPPER_DIV(blocks, BLOCK_SIZE)){
        block_max_kernel<<<blocks, BLOCK_SIZE, 0, asyncStream>>>(blockMax, blockMax, N);    
    }
    softmax_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, asyncStream>>>(input, output, blockMax, expSum, N);
    softmax_tail_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, asyncStream>>>(input, output, blockMax, expSum, N);
    cudaFreeAsync(expSum, asyncStream);
    cudaFreeAsync(blockMax, asyncStream);
    cudaStreamDestroy(asyncStream);
}

0.42211 ms 20.0th percentile (B200)

At first glance this looks a lot like reduction, just with sum swapped for sum plus normalization.

The trap: the sum is over exponentials, and exponentiating directly is guaranteed to overflow. Quick sanity check: can e^1000 be represented as a float?

The fix: find the maximum first and subtract it from every input before exponentiating (equivalently, divide every e^x by e^max). The result is unchanged, since numerator and denominator scale by the same factor, and nothing overflows, because every exponent is now <= 0 (e^-1000 merely underflows to zero, harmlessly).
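In symbols, with m the maximum over all inputs:

\mathrm{softmax}(x)_i = \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j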

The pipeline becomes max, then sum, then normalize. The max pass mirrors the sum pass almost exactly, except atomicMax for float doesn't exist, so this version substitutes a looped cascade over blockMax.

Float atomics

#include <cuda_runtime.h>
#define BLOCK_SIZE 1024
#define UPPER_DIV(A, B) ((A + B - 1) / B)
__device__ float atomic_max_float(float *address, float val)
{
    int *address_as_int = (int *)address;
    int old = *address_as_int, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}

__global__ void max_kernel(const float* input, float* output, int N){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    //same as reduction, but get max instead of sum
    __shared__ float sharedMem[BLOCK_SIZE];
    float v = (idx < N) ? input[idx] : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads(); 

    for(int stride = blockDim.x / 2; stride >= warpSize; stride /= 2){
        if(threadIdx.x < stride){
            sharedMem[threadIdx.x] = fmaxf(sharedMem[threadIdx.x], sharedMem[threadIdx.x + stride]);
        }
        __syncthreads();    
    }

    if(threadIdx.x < warpSize){
        float blockMax = sharedMem[threadIdx.x];          
        for(int stride = warpSize / 2; stride > 0; stride /= 2){
            blockMax = fmaxf(blockMax, __shfl_down_sync(0xffffffff, blockMax, stride));
        }
        if(threadIdx.x == 0){
            atomic_max_float(output, blockMax);
        }
    }  
}
__global__ void softmax_kernel(const float* input, float* output, float* inputMax, float* expSum, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float sharedMem[BLOCK_SIZE];
    float v = (idx < N) ? __expf(input[idx] - *inputMax) : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads(); 

    for(int stride = blockDim.x / 2; stride >= warpSize; stride /= 2){
        if(threadIdx.x < stride){
            sharedMem[threadIdx.x] += sharedMem[threadIdx.x + stride];
        }
        __syncthreads();    
    }

    if(threadIdx.x < warpSize){
        float blockSum = sharedMem[threadIdx.x];          
        for(int stride = warpSize / 2; stride > 0; stride /= 2){
            blockSum += __shfl_down_sync(0xffffffff, blockSum, stride);
        }
        if(threadIdx.x == 0){
            atomicAdd(expSum, blockSum);
        }
    }
}
__global__ void softmax_tail_kernel(const float* input, float* output, float* inputMax, float* expSum, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= N ) return;
    auto sum = *expSum;
    output[idx] = __expf(input[idx] - *inputMax) / sum;
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int N) {
    const int blocksPerGrid = UPPER_DIV(N, BLOCK_SIZE);
    float *expSum;
    float *inputMax;
    cudaMallocAsync(&expSum, sizeof(float), cudaStreamDefault);
    cudaMallocAsync(&inputMax, sizeof(float), cudaStreamDefault);
    max_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, cudaStreamDefault>>>(input, inputMax, N); 
    softmax_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, cudaStreamDefault>>>(input, output, inputMax, expSum, N);
    softmax_tail_kernel<<<blocksPerGrid, BLOCK_SIZE, 0, cudaStreamDefault>>>(input, output, inputMax, expSum, N);
}

0.0305 ms 88.6th percentile (B200)

atomicCAS builds the missing atomicMax(float), so the max pass now matches the sum reduction one-for-one.

dot-product

This problem is essentially a reduction, just with an elementwise multiply before the sum.

The most robust shape is a two-level reduction:

  • each thread first accumulates a local sum along a global stride
  • warps reduce via shuffles
  • a single warp per block finishes up
  • one atomicAdd folds the block result into the global total

Shared memory then only stages the per-warp results; no full block-wide tree reduction is needed.

__device__ __forceinline__ float warp_reduce_sum(float x) {
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    x += __shfl_down_sync(0xffffffffu, x, offset);
  }
  return x;
}

__global__ void dot_product_kernel(const float* A, const float* B, float* result, int N) {
  float thread_sum = 0.0f;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = tid; i < N; i += stride) {
    thread_sum = fmaf(A[i], B[i], thread_sum);
  }

  __shared__ float warp_sums[32];
  int lane = threadIdx.x & (warpSize - 1);
  int warp = threadIdx.x / warpSize;
  thread_sum = warp_reduce_sum(thread_sum);

  if (lane == 0) warp_sums[warp] = thread_sum;
  __syncthreads();

  if (warp == 0) {
    float block_sum = (lane < (blockDim.x + warpSize - 1) / warpSize) ? warp_sums[lane] : 0.0f;
    block_sum = warp_reduce_sum(block_sum);
    if (lane == 0) atomicAdd(result, block_sum);
  }
}

0.05 ms 13.8th (A100)

The crux here isn't the multiply; it's how short the reduction path is.

Full code: solutions/reduction/dot-product.cu

parallel-merge

The core of this problem isn't the comparisons; it's slicing two sorted arrays into mutually independent segments.

baseline

The basic version skips tile merging entirely: every element hunts down its own final position.

The method is direct:

  • for A[i], binary-search how many B elements must precede it
  • for B[j], binary-search how many A elements must precede it
  • each element then writes itself to its unique slot in C

The genuinely tricky part is what the binary search actually compares:

  • on the A side, find lower_bound(B, A[i])
  • on the B side, find upper_bound(A, B[j])

Why split it this way:

  • for A[i], we count the B elements strictly smaller than it, so the moment B[mid] >= A[i] the answer lies to the left: that is lower_bound
  • for B[j], every A element equal to it must also count toward its prefix, so only A[mid] > B[j] means the right side is too large: that is upper_bound

With one bound leaning left and the other right, duplicates never collide on the same slot, and stable ordering falls out naturally. (For example, with A = [2] and B = [2]: A's 2 counts zero smaller B elements and lands at C[0], while B's 2 counts one A element <= it and lands at C[1].)

#define BLOCK_SIZE 256

__device__ inline int binary_search(const float* B, int N, float k, bool is_upper) {
  int l = 0, r = N;
  while (l < r) {
    int mid = (l + r) / 2;
    if ((!is_upper && B[mid] >= k) || (is_upper && B[mid] > k)) {
      r = mid;
    } else {
      l = mid + 1;
    }
  }
  return l;
}

__global__ void asc_merge_A(const float* A, const float* B, float* C, int M, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= M) return;
  int countB = binary_search(B, N, A[idx], false);
  C[countB + idx] = A[idx];
}

__global__ void asc_merge_B(const float* A, const float* B, float* C, int M, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) return;
  int countA = binary_search(A, M, B[idx], true);
  C[countA + idx] = B[idx];
}

extern "C" void solve(const float* A, const float* B, float* C, int M, int N) {
  asc_merge_A<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(A, B, C, M, N);
  asc_merge_B<<<(N + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(A, B, C, M, N);
}

2.05 ms 50.0th (A100)

This version is the right place to fully internalize why, for equal elements, one side uses lower_bound and the other upper_bound.

optimized

Only beyond that does merge path enter:

  • first assign each block an output range along the global diagonals
  • binary-search the A/B split point for that range
  • stage that chunk of input in shared memory
  • then hand each thread its own smaller diagonal segment within the block

Each block then merges its own small local problem; threads never have to fight over positions.

template <typename T>
__device__ __forceinline__ int merge_path_partition(const T* a, int m, const T* b, int n, int diag) {
  int low = max(0, diag - n);
  int high = min(diag, m);
  while (low < high) {
    int mid = (low + high) >> 1;
    int j = diag - mid;
    if (j > 0 && mid < m && b[j - 1] >= a[mid]) low = mid + 1;
    else high = mid;
  }
  return low;
}

__global__ void parallel_merge_kernel(const float* a, const float* b, float* c, int m, int n) {
  __shared__ float tile[kTileItems];
  int block_diag = blockIdx.x * kTileItems;
  int total = m + n;
  if (block_diag >= total) return;

  int tile_count = min(kTileItems, total - block_diag);
  int a0 = merge_path_partition(a, m, b, n, block_diag);
  int a1 = merge_path_partition(a, m, b, n, block_diag + tile_count);
  int b0 = block_diag - a0;
  int b1 = block_diag + tile_count - a1;

  // load tile_a / tile_b to shared memory
  // each thread merges its own diagonal slice
  // write back contiguous outputs
}

1.02 ms 83.3th (A100)

What's really valuable here is the "partition the output first, then back-solve the input" mindset, much like GEMM-family problems.

Full code: solutions/reduction/parallel-merge.cu

int8-quantized-matmul

The smoothest way into this problem is not to stare at A or B, but at the output C.

Cut C into 16 x 16 tiles: one block owns one C tile, one thread owns one output C[row, col]. Back-solving the inputs, the block turns out to need only:

  • a 16 x K strip of A
  • a K x 16 strip of B

Seen from C, the access pattern is a cross. Once that viewpoint is fixed, the tiling, the boundaries, and the quantization epilogue all become much clearer.

The real technical difficulty isn't the GEMM body; it's these three spots:

  • when to subtract the zero point: inner loop or epilogue
  • whether the rounding in the quantization epilogue matches the judge's
  • when chasing speed, how to keep the int32 accumulators around and quantize once at the end

Shared groundwork

Keep these two formulas in mind; all three versions below revolve around them:

int corrected = raw - zp_b * row_sum_a - zp_a * col_sum_b + K * zp_a * zp_b;
int q = nearbyintf(raw_or_corrected * scale_a * scale_b / scale_c) + zp_c;

The first version can subtract the zero points right in the inner loop. The latter two are better off computing the raw int32 GEMM first, then correcting and quantizing in one shot.

scalar

The heart of the scalar version is not the naive triple loop; it's pinning down the thread mapping onto C first.

  • one block per 16 x 16 output tile
  • one thread per output element
  • each thread sweeps once along the K direction
  • back-solving from C[row, col], it only reads A[row, :] and B[:, col]

constexpr int kTile = 16;

__device__ __forceinline__ int8_t clamp_to_int8(int x) {
  return static_cast<int8_t>(max(-128, min(127, x)));
}

__global__ void scalar_kernel(const int8_t* A,
                              const int8_t* B,
                              int8_t* C,
                              int M,
                              int N,
                              int K,
                              float scale_A,
                              float scale_B,
                              float scale_C,
                              int zp_A,
                              int zp_B,
                              int zp_C) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= M || col >= N) return;

  int acc = 0;
  for (int k = 0; k < K; ++k) {
    int a = static_cast<int>(A[row * K + k]) - zp_A;
    int b = static_cast<int>(B[k * N + col]) - zp_B;
    acc += a * b;
  }

  float x = static_cast<float>(acc) * scale_A * scale_B / scale_C;
  C[row * N + col] = clamp_to_int8(static_cast<int>(nearbyintf(x)) + zp_C);
}

extern "C" void solve(...) {
  dim3 threads(kTile, kTile);
  dim3 blocks((N + kTile - 1) / kTile, (M + kTile - 1) / kTile);
  scalar_kernel<<<blocks, threads>>>(...);
}

49.04 ms 4.5th (A100)

In this version, shared memory isn't required. Getting the output mapping and the quantization details right matters more than reaching for shared memory.

tensor core

On an A100, the wmma int8 route becomes worth considering.

  • one warp owns one 16 x 16 output tile
  • march along K in steps of 16
  • accumulate into int32 first
  • quantize the tile's results on write-back

#include <mma.h>
using namespace nvcuda;

__global__ void wmma_int8_kernel(const int8_t* A,
                                 const int8_t* B,
                                 int8_t* C,
                                 int M,
                                 int N,
                                 int K,
                                 float scale_A,
                                 float scale_B,
                                 float scale_C,
                                 int zp_C) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
  constexpr int WM = 16, WN = 16, WK = 16;
  int warp_id = threadIdx.x / 32;
  int row = blockIdx.y * 64 + (warp_id / 2) * WM;
  int col = blockIdx.x * 32 + (warp_id % 2) * WN;

  wmma::fragment<wmma::accumulator, WM, WN, WK, int> acc;
  wmma::fill_fragment(acc, 0);

  for (int kk = 0; kk < K; kk += WK) {
    wmma::fragment<wmma::matrix_a, WM, WN, WK, signed char, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, WM, WN, WK, signed char, wmma::row_major> b;
    wmma::load_matrix_sync(a, reinterpret_cast<const signed char*>(A + row * K + kk), K);
    wmma::load_matrix_sync(b, reinterpret_cast<const signed char*>(B + kk * N + col), N);
    wmma::mma_sync(acc, a, b, acc);
  }

  // store acc -> int32 tile, then quantize to int8
#endif
}

5.48 ms 95.5th (A100)

When zero_point_A == 0 && zero_point_B == 0, this path is the most direct.
With nonzero zero points the idea stays the same:

  • compute raw_C with tensor cores first
  • then apply the shared correction-and-quantization formula (a sketch follows)
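A sketch of that epilogue (my naming; row_sum_a and col_sum_b are assumed to be precomputed raw int8 row/column sums, matching the formulas under "Shared groundwork"):

__device__ __forceinline__ int8_t correct_and_quantize(
    int raw, int row_sum_a, int col_sum_b, int K,
    float scale_A, float scale_B, float scale_C,
    int zp_A, int zp_B, int zp_C) {
  // fold the zero points out of the raw int32 accumulator...
  int corrected = raw - zp_B * row_sum_a - zp_A * col_sum_b + K * zp_A * zp_B;
  // ...then rescale, round, shift, clamp
  int q = static_cast<int>(nearbyintf(corrected * scale_A * scale_B / scale_C)) + zp_C;
  return static_cast<int8_t>(max(-128, min(127, q)));
}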

Full code: solutions/quantization/int8-quantized-matmul.cu

batch-normalization

The crux of BatchNorm isn't the formula; it's how the statistics pass and the normalization pass get split up.

baseline

The plainest version launches one block per channel:

  • one sweep for the mean
  • a second sweep for the variance
  • a third sweep to write back the normalized result

__global__ void batch_norm_baseline_kernel(const float* input,
                                           const float* gamma,
                                           const float* beta,
                                           float* output,
                                           int N,
                                           int C,
                                           float eps) {
  __shared__ float mean;
  __shared__ float variance;

  int channel = blockIdx.x;
  int tid = threadIdx.x;

  float thread_sum = 0.0f;
  for (int row = tid; row < N; row += blockDim.x) {
    thread_sum += input[row * C + channel];
  }

  float sum = block_reduce_sum(thread_sum);
  if (tid == 0) mean = sum / float(N);
  __syncthreads();

  float thread_sq_sum = 0.0f;
  for (int row = tid; row < N; row += blockDim.x) {
    float diff = input[row * C + channel] - mean;
    thread_sq_sum += diff * diff;
  }

  float sq_sum = block_reduce_sum(thread_sq_sum);
  if (tid == 0) variance = sq_sum / float(N);
  __syncthreads();

  float inv_std = rsqrtf(variance + eps);
  for (int row = tid; row < N; row += blockDim.x) {
    int index = row * C + channel;
    output[index] = gamma[channel] * ((input[index] - mean) * inv_std) + beta[channel];
  }
}

0.19 ms 43.8th (A100)

Simple and direct, but every channel gets swept three times end to end; for large N the bandwidth utilization is mediocre.
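The kernel leans on a block_reduce_sum helper that lives in the full solution. A typical shuffle-based version (a sketch, not necessarily the repo's exact code; the result is valid on thread 0):

__device__ float block_reduce_sum(float v) {
  __shared__ float warp_sums[32];
  int lane = threadIdx.x & 31;
  int warp = threadIdx.x >> 5;
  for (int offset = 16; offset > 0; offset >>= 1)     // reduce within each warp
    v += __shfl_down_sync(0xffffffffu, v, offset);
  if (lane == 0) warp_sums[warp] = v;
  __syncthreads();
  int warps = (blockDim.x + 31) >> 5;
  v = (threadIdx.x < warps) ? warp_sums[threadIdx.x] : 0.0f;
  if (warp == 0)                                       // warp 0 combines the partials
    for (int offset = 16; offset > 0; offset >>= 1)
      v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}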

tiled

The plan becomes:

  • first accumulate sum / sumsq per (row tile, channel tile) block
  • a separate kernel turns those into mean / inv_std
  • a final pass writes the output back in 2D tiles

__global__ void accumulate_stats_tiled_kernel(const float* input,
                                              float* sum,
                                              float* sumsq,
                                              int N,
                                              int C) {
  __shared__ float shared_sum[kStatsThreadRows][kTileChannels];
  __shared__ float shared_sumsq[kStatsThreadRows][kTileChannels];

  int channel = blockIdx.x * blockDim.x + threadIdx.x;
  int row_begin = blockIdx.y * kStatsRowsPerBlock;
  int row_end = min(row_begin + kStatsRowsPerBlock, N);

  float local_sum = 0.0f;
  float local_sumsq = 0.0f;
  if (channel < C) {
    for (int row = row_begin + threadIdx.y; row < row_end; row += blockDim.y) {
      float v = input[row * C + channel];
      local_sum += v;
      local_sumsq += v * v;
    }
  }

  shared_sum[threadIdx.y][threadIdx.x] = local_sum;
  shared_sumsq[threadIdx.y][threadIdx.x] = local_sumsq;
  __syncthreads();

  if (threadIdx.y == 0 && channel < C) {
    float total_sum = 0.0f, total_sumsq = 0.0f;
    for (int i = 0; i < kStatsThreadRows; ++i) {
      total_sum += shared_sum[i][threadIdx.x];
      total_sumsq += shared_sumsq[i][threadIdx.x];
    }
    atomicAdd(sum + channel, total_sum);
    atomicAdd(sumsq + channel, total_sumsq);
  }
}

0.06 ms 93.8th (A100)

The point here is making both the statistics and the write-back 2D-tiled, keeping the channel direction contiguous throughout; it suits cases where N and C are both fairly large.

The lesson worth keeping: the statistics aggregate per channel, so tiling should put contiguity along the channel direction first.

Full code: solutions/normalization/batch-normalization.cu

rotary-positional-embedding

The core of RoPE isn't a matrix multiply; it's a pairwise transform.

Going element by element is an easy way to get lost. View each row as two halves first:

  • split each row into a front half and a back half
  • each position interacts only with its paired position
  • the low half gets q_low * cos - q_high * sin
  • the high half gets q_high * cos + q_low * sin

baseline

The most intuitive version: one block handles one token.

  • stage the token's entire Q row in shared memory
  • one thread per dimension of the row
  • the low half reads its high-half partner; the high half reads its low-half partner

This is the smoothest mental model: since paired elements always sit in the same row, cache the whole row once and every cross access becomes a mere shared-memory read or write.

__global__ void ropeToken(float* Q, float* cos, float* sin, float* output, int M, int D) {
  int bid = blockIdx.x;
  int tid = threadIdx.x;
  extern __shared__ float token[];
  int offset = bid * D;

  token[tid] = Q[offset + tid];
  __syncthreads();

  output[offset + tid] = token[tid] * cos[offset + tid];
  if (tid >= D / 2) {
    output[offset + tid] += token[tid - D / 2] * sin[offset + tid];
  } else {
    output[offset + tid] -= token[tid + D / 2] * sin[offset + tid];
  }
}

extern "C" void solve(float* Q, float* cos, float* sin, float* output, int M, int D) {
  ropeToken<<<M, D, D * sizeof(float)>>>(Q, cos, sin, output, M, D);
  cudaDeviceSynchronize();
}

1.53 ms 9.5th (A100)

The key here isn't the formula; it's first confirming that the rotation happens strictly inside one token, so partition by rows first, then find the paired dimension within the row.

optimized

One level up, the natural unit of parallelism is no longer an element but a pair. Each thread then handles both positions of a pair directly, with no need to stage the whole row in shared memory. With D == 128 there's room for float4 vectorization on top, four pairs at a time.

__global__ void rope_pairs_kernel(const float* Q,
                                  const float* cos,
                                  const float* sin,
                                  float* output,
                                  int M,
                                  int D) {
  size_t half = D >> 1;
  size_t pair_idx = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
  size_t total_pairs = size_t(M) * half;
  if (pair_idx >= total_pairs) return;

  size_t row = pair_idx / half;
  size_t col = pair_idx - row * half;
  size_t base = row * size_t(D) + col;
  size_t high = base + half;

  float q_low = Q[base];
  float q_high = Q[high];
  output[base] = fmaf(q_low, cos[base], -q_high * sin[base]);
  output[high] = fmaf(q_high, cos[high], q_low * sin[high]);
}

1.22 ms 97.6th (A100)

For a fixed hot path like D = 128, a dedicated float4 version can read the low and high halves as vectors and write both back together; a sketch follows.
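A hedged sketch of that D == 128 fast path (my code, not the repo's): each row is 32 float4s, 16 per half, and each thread rotates one float4 pair.

__device__ __forceinline__ float4 mul4(float4 a, float4 b) {
  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
__device__ __forceinline__ float4 fma4(float4 a, float4 b, float4 c) {
  return make_float4(fmaf(a.x,b.x,c.x), fmaf(a.y,b.y,c.y), fmaf(a.z,b.z,c.z), fmaf(a.w,b.w,c.w));
}
__device__ __forceinline__ float4 neg4(float4 a) {
  return make_float4(-a.x, -a.y, -a.z, -a.w);
}

__global__ void rope_d128_vec4_kernel(const float4* __restrict__ Q,
                                      const float4* __restrict__ cosv,
                                      const float4* __restrict__ sinv,
                                      float4* __restrict__ out, int M) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;   // one float4 of some row's low half
  if (idx >= M * 16) return;                          // 16 float4 per half-row
  int row = idx >> 4, v = idx & 15;
  int lo = row * 32 + v;                              // row stride = 128 floats = 32 float4
  int hi = lo + 16;                                   // the paired vector in the high half
  float4 ql = Q[lo], qh = Q[hi];
  out[lo] = fma4(ql, cosv[lo], neg4(mul4(qh, sinv[lo])));  // q_low * cos - q_high * sin
  out[hi] = fma4(qh, cosv[hi], mul4(ql, sinv[hi]));        // q_high * cos + q_low * sin
}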

Full code: solutions/transformer/rotary-positional-embedding.cu

rms-normalization

RMSNorm splits into two phases:

  • first compute sum(x^2) to obtain the global scale
  • then apply y = x * scale * gamma + beta

baseline

The baseline spells the work out honestly:

  • a first kernel computes each block's sum of squares
  • a second kernel reduces those partial sums into a total
  • the total is copied back to the host to compute inv_rms
  • a third kernel normalizes and writes back

This version doesn't chase peak performance, but its structure is the most transparent and the easiest to check for correctness.

namespace {

constexpr int kBlockSize = 256;

__global__ void rms_sum_squares(const float* input, double* partial_sums, int N) {
  __shared__ double shared[kBlockSize];

  int tid = threadIdx.x;
  int global_tid = blockIdx.x * blockDim.x + tid;
  int stride = blockDim.x * gridDim.x;

  double local_sum = 0.0;
  for (int idx = global_tid; idx < N; idx += stride) {
    double value = static_cast<double>(input[idx]);
    local_sum += value * value;
  }
  shared[tid] = local_sum;
  __syncthreads();

  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
    if (tid < offset) shared[tid] += shared[tid + offset];
    __syncthreads();
  }

  if (tid == 0) partial_sums[blockIdx.x] = shared[0];
}

__global__ void rms_reduce_partial_sums(const double* partial_sums, double* total_sum, int count) {
  __shared__ double shared[kBlockSize];

  int tid = threadIdx.x;
  double local_sum = 0.0;
  for (int idx = tid; idx < count; idx += blockDim.x) {
    local_sum += partial_sums[idx];
  }
  shared[tid] = local_sum;
  __syncthreads();

  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
    if (tid < offset) shared[tid] += shared[tid + offset];
    __syncthreads();
  }

  if (tid == 0) total_sum[0] = shared[0];
}

__global__ void rms_normalize(const float* input, float gamma, float beta, float* output, int N,
                              float inv_rms) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    output[idx] = input[idx] * inv_rms * gamma + beta;
  }
}

}  // namespace

0.23 ms 9.9th (A100)

optimized

The real nuisance: the two phases share a single scalar, and you'd rather not shatter the whole flow into a pile of kernels. The smooth way through:

  • try a cooperative launch first
  • the whole grid reduces the sum of squares
  • after grid.sync(), a single thread publishes the global scale
  • one more grid.sync(), and every thread proceeds straight to the normalized write-back

As long as the cooperative launch succeeds, the whole problem finishes in one kernel; an ordinary multi-kernel fallback covers the rest.

template <int kThreads>
__global__ void rms_cooperative_kernel(const float* input,
                                       float gamma,
                                       float beta,
                                       float* output,
                                       int N,
                                       float eps) {
  cg::grid_group grid = cg::this_grid();
  float local_sum = 0.0f;

  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    local_sum += input[i] * input[i];
  }

  float block_sum = block_reduce_sum<kThreads>(local_sum);
  if (threadIdx.x == 0) atomicAdd(&g_sum_sq, block_sum);
  grid.sync();

  if (blockIdx.x == 0 && threadIdx.x == 0) {
    g_scale = rsqrtf(g_sum_sq / float(N) + eps) * gamma;
  }
  grid.sync();

  float scale = g_scale;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    output[i] = fmaf(input[i], scale, beta);
  }
}

0.04 ms 74.1th (A100)

The most valuable bit isn't the formula; it's compressing "compute the scale, then use the scale" into one efficient execution path.
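For completeness, a hedged host-side sketch of the cooperative launch (my code; assumes the kernel above, that the g_sum_sq / g_scale device globals are zeroed beforehand, e.g. via cudaMemcpyToSymbol, and placeholder gamma/beta/eps plumbing):

extern "C" void solve(const float* input, float* output, int N) {
  constexpr int kThreads = 256;
  int dev = 0, supported = 0, sm = 0, blocksPerSm = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, dev);
  cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev);
  // grid.sync() requires every block to be co-resident, so size the grid by occupancy
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocksPerSm, rms_cooperative_kernel<kThreads>, kThreads, 0);
  int grid = sm * blocksPerSm;
  float gamma = 1.0f, beta = 0.0f, eps = 1e-5f;   // placeholders
  void* args[] = { (void*)&input, &gamma, &beta, (void*)&output, &N, &eps };
  if (supported && grid > 0) {
    cudaLaunchCooperativeKernel((void*)rms_cooperative_kernel<kThreads>,
                                dim3(grid), dim3(kThreads), args);
  } else {
    // fall back to the multi-kernel baseline path
  }
}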

Full code: solutions/normalization/rms-normalization.cu

weight-dequantization

The easy trap is a purely scalar version; what should actually be exploited is that every element in a tile shares the same scale.

That is, the output matrix is written element by element, but the scale is looked up per tile, not per element. From that observation the optimizations follow naturally:

  • if N is 4-aligned, prefer float4 vectorization
  • if tile_size is fairly large, map one block straight onto one scale tile
  • then every thread in the block reuses the same scale without recomputing indices

template <int kTileSize>
__global__ void weight_dequant_tile_vec4_kernel(const float* X, const float* S, float* Y, int M, int N) {
  constexpr int kBlockRows = 8;
  constexpr int kVecWidth = 4;

  int tile_col = blockIdx.x;
  int row = blockIdx.y * kBlockRows + threadIdx.y;
  int col = tile_col * kTileSize + threadIdx.x * kVecWidth;
  if (row >= M || col >= N) return;

  int scale_cols = (N + kTileSize - 1) / kTileSize;
  float scale = S[(row / kTileSize) * scale_cols + tile_col];
  float4 v = reinterpret_cast<const float4*>(X + row * N + col)[0];
  reinterpret_cast<float4*>(Y + row * N + col)[0] =
      make_float4(v.x * scale, v.y * scale, v.z * scale, v.w * scale);
}

0.34 ms 75.0th (A100)

What separates this from an ordinary elementwise kernel: the optimization goal isn't doing fewer multiplies, it's wringing everything out of "one tile, one scale".

Full code: solutions/quantization/weight-dequantization.cu

Hard

gpt-2-transformer-block

No single kernel is the hard part here; it's stringing the whole computation graph together.

Staring straight at the packed weights gets confusing fast; the steadiest move is to split the block into five stages first:

  1. LN1
  2. QKV -> attention -> output projection
  3. first residual
  4. LN2
  5. FC -> GELU -> projection -> residual

The current implementation runs a plain but clear staged pipeline:

  • LayerNorm as its own kernel
  • the QKV projection as its own kernel
  • attention unrolled over (token, head)
  • the output projection through a generic linear layer
  • the FFN as one more up -> GELU -> proj pass

extern "C" void solve(const float* x, float* output, const float* weights, int seq_len) {
  // allocate ln1 / qkv / attn / residual / ln2 / ffn buffers

  layerNorm<<<seq_len, 1>>>(x, ln1, weights, seq_len);
  qkv<<<seq_len, 256>>>(ln1, qkv_buf, weights, seq_len);

  attn<<<dim3(seq_len, kNumHeads), kHeadDim,
         static_cast<size_t>(seq_len) * sizeof(float)>>>(qkv_buf, attn_concat, weights, seq_len);

  linear_bias<<<linear_grid, 256>>>(
      attn_concat, attn_proj, weights + kWAttnOffset, weights + kBAttnOffset,
      seq_len, kDModel, kDModel);

  add_residual<<<(token_count + 255) / 256, 256>>>(x, attn_proj, residual1, token_count);
  layerNorm2<<<seq_len, 1>>>(residual1, ln2, weights, seq_len);
  ffn<<<seq_len, 256>>>(ln2, ff2, weights, seq_len);
  add_residual<<<(token_count + 255) / 256, 256>>>(residual1, ff2, output, token_count);
}

The value of this version is getting the GPT-2 block's structure running end to end, not making every layer optimal.

The step most worth a separate look is attention: the implementation unrolls over (row, head), one block handling one head of one query token: compute the full score row, softmax it, then weight V with it.

__global__ void attn(const float* qkv, float* output, const float* weights, int seq_len) {
  int row = blockIdx.x;
  int head = blockIdx.y;
  int lane = threadIdx.x;
  if (row >= seq_len || head >= kNumHeads || lane >= kHeadDim) return;

  extern __shared__ float scores[];

  if (lane == 0) {
    // compute full score row
    // softmax over seq_len
  }
  __syncthreads();

  float acc = 0.0f;
  for (int j = 0; j < seq_len; ++j) {
    acc += scores[j] * V(row=j, head, lane);
  }
  output[row * kDModel + head * kHeadDim + lane] = acc;
}

So the point of this problem isn't one flashy trick; it's straightening out the transformer block's data flow first:

  • how the packed weights unpack
  • how the intermediate buffers chain together
  • where the residuals sit
  • how attention's (token, head, dim) mapping lands in a kernel

Full code: solutions/transformer/gpt-2-transformer-block.cu
