Skip to content

Commit 0ed235e

Browse files
authored
[CUDA] Added a cudaMemcpy2DAsync fast path to ggml_cuda_cpy (ggml-org#25057)
* [CUDA] Added a cudaMemcpy2DAsync fast path to ggml_cuda_cpy Add a CUDA ggml_cpy fast path for same-type, same-shape strided copies that are just 2D pitched block copies. When tensors are not fully contiguous but each row is contiguous, it now uses cudaMemcpy2DAsync instead of the slow element-wise scalar copy kernel. This fixes the GDN recurrent snapshot update with -np 4, where rollback slots are separated by cache stride gaps. * Add new tests that execute the new optimized strided copy path * Return unsupported for strided copy in OpenVINO, as new tests are failing
1 parent 9bebfcb commit 0ed235e

3 files changed

Lines changed: 77 additions & 7 deletions

File tree

ggml/src/ggml-cuda/cpy.cu

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,46 @@ static void ggml_cpy_f32_iq4_nl_cuda(
386386
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
387387
}
388388

389+
// check if a same-type copy reduces to a 2D strided copy (height rows of width
390+
// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel
391+
static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1,
392+
size_t & width, size_t & height, size_t & spitch, size_t & dpitch) {
393+
// require matching shape: a reshaped copy maps elements by flat order, which the
394+
// prefix walk below does not handle
395+
if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) {
396+
return false;
397+
}
398+
399+
// grow the contiguous prefix block shared by both tensors
400+
size_t block_nb = ggml_element_size(src0);
401+
int d = 0;
402+
for (; d < GGML_MAX_DIMS; ++d) {
403+
if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) {
404+
break;
405+
}
406+
block_nb *= src0->ne[d];
407+
}
408+
409+
// d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy)
410+
if (d == 0 || d == GGML_MAX_DIMS) {
411+
return false;
412+
}
413+
414+
// dim d carries the rows; everything above it must be a single element
415+
for (int i = d + 1; i < GGML_MAX_DIMS; ++i) {
416+
if (src0->ne[i] != 1) {
417+
return false;
418+
}
419+
}
420+
421+
width = block_nb;
422+
height = src0->ne[d];
423+
spitch = src0->nb[d];
424+
dpitch = src1->nb[d];
425+
426+
return spitch >= width && dpitch >= width;
427+
}
428+
389429
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
390430
const int64_t ne = ggml_nelements(src0);
391431
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -421,6 +461,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
421461
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
422462
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
423463

464+
size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0;
465+
424466
if (src0->type == src1->type && contiguous_srcs) {
425467
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
426468
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
@@ -431,6 +473,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
431473
{
432474
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
433475
}
476+
} else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) {
477+
CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch,
478+
mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream));
434479
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
435480
if (can_be_transposed) {
436481
ggml_cpy_scalar_cuda<float, float, true>

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
10531053
(op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
10541054
return true;
10551055
}
1056+
// CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported
1057+
if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) {
1058+
return true;
1059+
}
10561060
break;
10571061
}
10581062
case GGML_OP_MUL_MAT: {

tests/test-backend-ops.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2890,12 +2890,17 @@ struct test_cpy : public test_case {
28902890
const std::array<int64_t, 4> ne_dst;
28912891
const std::array<int64_t, 4> permute_src;
28922892
const std::array<int64_t, 4> permute_dst;
2893+
const std::array<int64_t, 4> dst_alloc; // if set, dst is a view into a larger buffer (strided)
28932894
bool _src_use_permute;
28942895
bool _dst_use_permute;
28952896
bool _src_transpose;
28962897
bool _use_dst_shape;
2898+
bool _use_dst_alloc;
28972899

28982900
std::string vars() override {
2901+
if (_use_dst_alloc) {
2902+
return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc);
2903+
}
28992904
if (_use_dst_shape) {
29002905
return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
29012906
}
@@ -2943,12 +2948,15 @@ struct test_cpy : public test_case {
29432948
std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
29442949
std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
29452950
std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
2946-
bool transpose_src = false)
2951+
bool transpose_src = false,
2952+
std::array<int64_t, 4> dst_alloc = {0, 0, 0, 0})
29472953
: type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
2954+
dst_alloc(dst_alloc),
29482955
_src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
29492956
_dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
29502957
_src_transpose(transpose_src),
2951-
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
2958+
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0),
2959+
_use_dst_alloc(dst_alloc[0] > 0){}
29522960

29532961
ggml_tensor * build_graph(ggml_context * ctx) override {
29542962
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
@@ -2966,12 +2974,23 @@ struct test_cpy : public test_case {
29662974
}
29672975

29682976
std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
2969-
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
2970-
ggml_set_name(dst, "dst");
2977+
ggml_tensor * dst;
29712978

2972-
if (_dst_use_permute) {
2973-
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
2974-
ggml_set_name(dst, "dst_permuted");
2979+
if (_use_dst_alloc) {
2980+
// view a sub-block of a larger buffer -> strided dst
2981+
ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data());
2982+
ggml_set_name(dst_buf, "dst_buf");
2983+
dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3],
2984+
dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0);
2985+
ggml_set_name(dst, "dst_view");
2986+
} else {
2987+
dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
2988+
ggml_set_name(dst, "dst");
2989+
2990+
if (_dst_use_permute) {
2991+
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
2992+
ggml_set_name(dst, "dst_permuted");
2993+
}
29752994
}
29762995

29772996
ggml_tensor * out = ggml_cpy(ctx, src, dst);
@@ -8181,6 +8200,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
81818200
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
81828201
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
81838202
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
8203+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
8204+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
81848205

81858206
// CPY - different src/dst shapes (reshaping via CPY)
81868207
// Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.

0 commit comments

Comments
 (0)