Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/layer/vulkan/gemm_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
use_cooperative_matrix = vkdev->info.support_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_fp16_storage || opt.use_fp16_packed);

bool use_bf16_cooperative_matrix = false;
if (vkdev->info.support_bf16_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_bf16_storage || opt.use_bf16_packed))
if (vkdev->info.support_bf16_cooperative_matrix() && opt.use_cooperative_matrix && opt.use_bf16_storage)
{
use_cooperative_matrix = true;
use_bf16_cooperative_matrix = true;
Expand Down
2 changes: 1 addition & 1 deletion src/layer/vulkan/sdpa_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int SDPA_vulkan::create_pipeline(const Option& opt)
use_cooperative_matrix = vkdev->info.support_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_fp16_storage || opt.use_fp16_packed);

bool use_bf16_cooperative_matrix = false;
if (vkdev->info.support_bf16_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_bf16_storage || opt.use_bf16_packed))
if (vkdev->info.support_bf16_cooperative_matrix() && opt.use_cooperative_matrix && opt.use_bf16_storage)
{
use_cooperative_matrix = true;
use_bf16_cooperative_matrix = true;
Expand Down
60 changes: 30 additions & 30 deletions src/layer/vulkan/shader/gemm_cm.comp

Large diffs are not rendered by default.

40 changes: 20 additions & 20 deletions src/layer/vulkan/shader/sdpa_cross_cm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ void main()
const uvec4 ai8m8d2 = (ai8 % 8) / 2;
const uvec4 ai8m2 = ai8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(A_blob_data[ai4d8.r][ai4m8d2.r])[ai4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(A_blob_data[ai4d8.g][ai4m8d2.g])[ai4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(A_blob_data[ai4d8.b][ai4m8d2.b])[ai4m2.b];
Expand Down Expand Up @@ -352,7 +352,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gn4.r < psc(GN)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gn4.g < psc(GN)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gn4.b < psc(GN)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -457,7 +457,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -633,7 +633,7 @@ void main()
const uvec4 ai8m8d2 = (ai8 % 8) / 2;
const uvec4 ai8m2 = ai8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(A_blob_data[ai4d8.r][ai4m8d2.r])[ai4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(A_blob_data[ai4d8.g][ai4m8d2.g])[ai4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(A_blob_data[ai4d8.b][ai4m8d2.b])[ai4m2.b];
Expand Down Expand Up @@ -732,7 +732,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gn4.r < psc(GN)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gn4.g < psc(GN)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gn4.b < psc(GN)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -837,7 +837,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -872,7 +872,7 @@ void main()
}

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
#else
Expand Down Expand Up @@ -993,7 +993,7 @@ void main()
barrier();

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
#else
Expand Down Expand Up @@ -1131,7 +1131,7 @@ void main()
const uvec4 ai8m8d2 = (ai8 % 8) / 2;
const uvec4 ai8m2 = ai8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(A_blob_data[ai4d8.r][ai4m8d2.r])[ai4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(A_blob_data[ai4d8.g][ai4m8d2.g])[ai4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(A_blob_data[ai4d8.b][ai4m8d2.b])[ai4m2.b];
Expand Down Expand Up @@ -1230,7 +1230,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gn4.r < psc(GN)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gn4.g < psc(GN)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gn4.b < psc(GN)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -1335,7 +1335,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -1371,7 +1371,7 @@ void main()
barrier();

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
#else
Expand Down Expand Up @@ -1502,7 +1502,7 @@ void main()
const uvec4 ai8m8d2 = (ai8 % 8) / 2;
const uvec4 ai8m2 = ai8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(A_blob_data[ai4d8.r][ai4m8d2.r])[ai4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(A_blob_data[ai4d8.g][ai4m8d2.g])[ai4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(A_blob_data[ai4d8.b][ai4m8d2.b])[ai4m2.b];
Expand Down Expand Up @@ -1591,7 +1591,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gn4.r < psc(GN)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gn4.g < psc(GN)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gn4.b < psc(GN)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -1686,7 +1686,7 @@ void main()
const uvec4 bi8m8d2 = (bi8 % 8) / 2;
const uvec4 bi8m2 = bi8 % 2;

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
if (gk4.r < psc(GK)) v4.r = unpackBFloat2x16(B_blob_data[bi4d8.r][bi4m8d2.r])[bi4m2.r];
if (gk4.g < psc(GK)) v4.g = unpackBFloat2x16(B_blob_data[bi4d8.g][bi4m8d2.g])[bi4m2.g];
if (gk4.b < psc(GK)) v4.b = unpackBFloat2x16(B_blob_data[bi4d8.b][bi4m8d2.b])[bi4m2.b];
Expand Down Expand Up @@ -1722,7 +1722,7 @@ void main()
barrier();

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
#else
Expand Down Expand Up @@ -1832,7 +1832,7 @@ void main()
if (gn * 8 + 6 < psc(GN)) vd.r = float(buffer_ld1(attn_mask_blob_data, ci8.b));
if (gn * 8 + 7 < psc(GN)) vd.g = float(buffer_ld1(attn_mask_blob_data, ci8.a));

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
uvec4 v = uvec4(packBFloat2x16(va), packBFloat2x16(vb), packBFloat2x16(vc), packBFloat2x16(vd));
#else
uvec4 v = uvec4(packHalf2x16(va), packHalf2x16(vb), packHalf2x16(vc), packHalf2x16(vd));
Expand Down Expand Up @@ -1863,7 +1863,7 @@ void main()
#if NCNN_fp16_arithmetic
coopMatLoad(mask, tmp_o, ((sgi * UNROLL_SG_N + zn) * UNROLL_SG_M + zm) * (Nd8p * M), Nd8p, gl_CooperativeMatrixLayoutRowMajor);
#else
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> mask_fp16;
#else
coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> mask_fp16;
Expand Down Expand Up @@ -1894,7 +1894,7 @@ void main()
#if NCNN_fp16_arithmetic
coopMatStore(sum[zn][zm], tmp_o, ((sgi * UNROLL_SG_N + zn) * UNROLL_SG_M + zm) * (Nd8p * M), Nd8p, gl_CooperativeMatrixLayoutRowMajor);
#else
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum_fp16 = coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(sum[zn][zm]);
#else
coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum_fp16 = coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(sum[zn][zm]);
Expand Down Expand Up @@ -1972,7 +1972,7 @@ void main()
{
uvec4 v = tmp_o[(((sgi * UNROLL_SG_N + zn) * UNROLL_SG_M + zm) * M + i) * Nd8p + j];

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
afpvec4 vab = afpvec4(unpackBFloat2x16(v.r), unpackBFloat2x16(v.g));
afpvec4 vcd = afpvec4(unpackBFloat2x16(v.b), unpackBFloat2x16(v.a));
#else
Expand Down
26 changes: 13 additions & 13 deletions src/layer/vulkan/shader/sdpa_fa_cm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ void main()
barrier();

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> qm[UNROLL_SG_M];
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> km;
#else
Expand Down Expand Up @@ -491,7 +491,7 @@ void main()
coopmat<float, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> a;
coopMatLoad(a, tmp_s, ((sgi * UNROLL_SG_M + zm) * UNROLL_P_N + zp) * M * Np, Np, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> b = coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(a);
#else
coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> b = coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(a);
Expand All @@ -503,7 +503,7 @@ void main()
barrier();

// load P
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> pm[UNROLL_SG_M][UNROLL_P_N];
#else
coopmat<float16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> pm[UNROLL_SG_M][UNROLL_P_N];
Expand Down Expand Up @@ -561,7 +561,7 @@ void main()

// load V
#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> vm;
#else
coopmat<float16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> vm;
Expand Down Expand Up @@ -593,14 +593,14 @@ void main()
[[dont_unroll]] for (; j < dst_seqlen_d16; j++)
{
#if ncnn_VK_KHR_cooperative_matrix
coopmat<afp, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> qkm[UNROLL_SG_M];
coopmat<float, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> qkm[UNROLL_SG_M];
#elif ncnn_VK_NV_cooperative_matrix
fcoopmatNV<32, gl_ScopeSubgroup, M, N> qkm[UNROLL_SG_M];
#endif
[[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
{
#if ncnn_VK_KHR_cooperative_matrix
qkm[zm] = coopmat<afp, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0.f);
qkm[zm] = coopmat<float, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0.f);
#elif ncnn_VK_NV_cooperative_matrix
qkm[zm] = fcoopmatNV<32, gl_ScopeSubgroup, M, N>(0.f);
#endif
Expand Down Expand Up @@ -654,7 +654,7 @@ void main()
barrier();

#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> qm;
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> km;
#else
Expand Down Expand Up @@ -841,7 +841,7 @@ void main()
#if ncnn_VK_KHR_cooperative_matrix
[[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
{
coopmat<afp, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> cc;
coopmat<float, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> cc;
coopMatLoad(cc, smem_correction, (sgi * UNROLL_SG_M + zm) * M, 0, gl_CooperativeMatrixLayoutColumnMajor);

[[unroll]] for (uint c = 0; c < MAX_OUT_CHUNKS; c++)
Expand Down Expand Up @@ -888,10 +888,10 @@ void main()
// convert P from fp32 to fp16
[[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
{
coopmat<afp, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> a;
coopmat<float, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> a;
coopMatLoad(a, tmp_s, (sgi * UNROLL_SG_M + zm) * M * Np, Np, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> b = coopmat<bfloat16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(a);
#else
coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> b = coopmat<float16_t, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(a);
Expand All @@ -902,7 +902,7 @@ void main()
barrier();

// load P
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> pm[UNROLL_SG_M];
#else
coopmat<float16_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> pm[UNROLL_SG_M];
Expand Down Expand Up @@ -951,7 +951,7 @@ void main()

// load V
#if ncnn_VK_KHR_cooperative_matrix
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
coopmat<bfloat16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> vm;
#else
coopmat<float16_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> vm;
Expand Down Expand Up @@ -1012,7 +1012,7 @@ void main()
vec4 v1 = vec4(tmp_o[oi8.r], tmp_o[oi8.g], tmp_o[oi8.b], tmp_o[oi8.a]) * inv_sum;

uvec4 out_data;
#if NCNN_bf16_storage || NCNN_bf16_packed
#if NCNN_bf16_storage
out_data.x = packBFloat2x16(v0.rg);
out_data.y = packBFloat2x16(v0.ba);
out_data.z = packBFloat2x16(v1.rg);
Expand Down
6 changes: 6 additions & 0 deletions tests/perf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,9 @@ ncnn_add_layer_perf(BinaryOp)
ncnn_add_layer_perf(Concat)
ncnn_add_layer_perf(Sigmoid)
ncnn_add_layer_perf(BatchNorm)

# SDPA perf tests (decode and prefill phases).
# The two ncnn_add_perf() calls below run only when WITH_LAYER_sdpa
# evaluates to true; otherwise neither sdpa_decode nor sdpa_prefill is added.
# NOTE(review): ncnn_add_perf is defined outside this view - presumably it
# registers a standalone perf test by name; confirm against its definition.
if(WITH_LAYER_sdpa)
ncnn_add_perf(sdpa_decode)
ncnn_add_perf(sdpa_prefill)
endif()
Loading
Loading