Skip to content

Commit 7ba22c6

Browse files
authored
vulkan: Support unaligned tensors for ROPE (ggml-org#22637)
1 parent f4cc787 commit 7ba22c6

4 files changed

Lines changed: 47 additions & 4 deletions

File tree

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,6 +1354,8 @@ struct vk_op_rope_push_constants {
13541354
uint32_t nb11;
13551355
uint32_t nb12;
13561356
uint32_t nb13;
1357+
uint32_t a_offset;
1358+
uint32_t d_offset;
13571359
};
13581360
static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128");
13591361

@@ -10126,6 +10128,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
1012610128
GGML_UNUSED(src3);
1012710129
}
1012810130

10131+
// ROPE specialization: fills the a_offset / d_offset fields of the rope push constants.
// Both values are the result of get_misalign_bytes() divided by the tensor's type size,
// i.e. expressed in elements; the rope shader code adds them to its source and
// destination element indices.
// NOTE(review): get_misalign_bytes is defined elsewhere; presumably it returns how many
// bytes the tensor starts past the aligned binding offset — confirm.
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
10132+
// Integer division: assumes the misalignment is a whole number of elements — TODO confirm.
p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
10133+
p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
10134+
10135+
// Only the src0 and dst offsets are set here; the remaining sources are marked unused.
GGML_UNUSED(src1);
10136+
GGML_UNUSED(src2);
10137+
GGML_UNUSED(src3);
10138+
}
10139+
1012910140
template<typename PC>
1013010141
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) {
1013110142
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -11270,6 +11281,7 @@ static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *
1127011281
(uint32_t)src0->ne[2],
1127111282
nb01, nb02, nb03,
1127211283
nb11, nb12, nb13,
11284+
0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets
1127311285
};
1127411286

1127511287
return rope;
@@ -11365,6 +11377,11 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx,
1136511377
GGML_ASSERT(buf[i] != nullptr);
1136611378
}
1136711379

11380+
// a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned.
11381+
// Round the binding offset down to the storage buffer alignment; the remaining misalignment, counted in elements, goes in pc.rope.d_offset.
11382+
pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type);
11383+
offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1);
11384+
1136811385
std::array<uint32_t, 3> elements;
1136911386
elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] };
1137011387

ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03,
99
// Per-row offset in shared memory
1010
const uint ix = i0;
1111
#else
12-
const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
12+
const uint ix = p.a_offset + i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
1313
#endif
1414
return ix;
1515
}
@@ -48,6 +48,7 @@ void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_
4848
idst = i1*p.nb11 + i0;
4949
idst += rope_data_i[i2].x * p.set_rows_stride;
5050
}
51+
idst += p.d_offset;
5152

5253
if (i0 >= p.n_dims) {
5354
rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]);
@@ -84,6 +85,7 @@ void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_
8485
idst = i1*p.nb11 + i0/2;
8586
idst += rope_data_i[i2].x * p.set_rows_stride;
8687
}
88+
idst += p.d_offset;
8789

8890
if (i0 >= p.n_dims) {
8991
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
@@ -121,6 +123,7 @@ void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope
121123
idst = i1*p.nb11 + i0/2;
122124
idst += rope_data_i[i2].x * p.set_rows_stride;
123125
}
126+
idst += p.d_offset;
124127

125128
if (i0 >= p.n_dims) {
126129
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
@@ -176,7 +179,7 @@ void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rop
176179
return;
177180
}
178181

179-
const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
182+
const uint idst = p.d_offset + i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
180183
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);
181184

182185
const int sect_dims = p.sections[0] + p.sections[1];

ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ struct rope_params {
2626
uint nb11;
2727
uint nb12;
2828
uint nb13;
29+
30+
uint a_offset;
31+
uint d_offset;
2932
};
3033

3134
#endif // !defined(GGML_ROPE_PARAMS)

tests/test-backend-ops.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4851,6 +4851,21 @@ struct test_rope : public test_case {
48514851

48524852
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
48534853
ggml_set_name(a, "view_of_a");
4854+
} else if (v == 2) {
4855+
// second-half slice along dim 0 (mimics build_rope_2d in clip.cpp).
4856+
// The non-zero view offset (ne_a[0] * elem_size) often produces a
4857+
// non-aligned buffer offset, which exercises backends' alignment paths.
4858+
auto ne = ne_a; ne[0] *= 2;
4859+
a = ggml_new_tensor(ctx, type, 4, ne.data());
4860+
if (forward) {
4861+
ggml_set_param(a);
4862+
}
4863+
ggml_set_name(a, "a");
4864+
4865+
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3],
4866+
a->nb[1], a->nb[2], a->nb[3],
4867+
ne_a[0] * ggml_element_size(a));
4868+
ggml_set_name(a, "view_of_a");
48544869
} else {
48554870
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
48564871
if (forward) {
@@ -4913,8 +4928,6 @@ struct test_rope : public test_case {
49134928
} else {
49144929
out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
49154930
}
4916-
4917-
// TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
49184931
}
49194932
ggml_set_name(out, "out");
49204933

@@ -8687,6 +8700,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
86878700

86888701
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
86898702
}
8703+
8704+
// build_rope_2d-style: ROPE on a non-contiguous view
8705+
// that starts at a non-zero offset along dim 0
8706+
// (e.g. gemma4v vision second-half view).
8707+
for (int rmode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION }) {
8708+
test_cases.emplace_back(new test_rope(type, { 36, 16, 2457, 1}, 36, rmode, 512, fs, ef, af, ff, 2, fw));
8709+
}
86908710
}
86918711

86928712
all = false;

0 commit comments

Comments
 (0)