Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2164,6 +2164,31 @@ static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
return false;
}

// Check that the fused output does not overlap with source tensors
auto overlaps = [](const ggml_tensor * a, const ggml_tensor * b) -> bool {
if (!a->data || !b->data) {
return false;
}
const uintptr_t a_start = (uintptr_t) a->data;
const uintptr_t a_end = a_start + ggml_nbytes(a);
const uintptr_t b_start = (uintptr_t) b->data;
const uintptr_t b_end = b_start + ggml_nbytes(b);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future-proofing it's better to use ggml_backend_buft_get_alloc_size() instead of ggml_nbytes(), since the allocated size of a tensor can exceed its nominal byte size. The same applies in ggml_cuda_check_fusion_memory_ranges().

return a_start < b_end && b_start < a_end;
};

if (overlaps(glu, ffn_up->src[0]) || overlaps(glu, ffn_up->src[1]) || overlaps(glu, ffn_gate->src[0])) {
return false;
}

if (has_bias) {
const ggml_tensor * up_bias_data = ffn_up_bias->src[0] == ffn_up ? ffn_up_bias->src[1] : ffn_up_bias->src[0];
const ggml_tensor * gate_bias_data =
ffn_gate_bias->src[0] == ffn_gate ? ffn_gate_bias->src[1] : ffn_gate_bias->src[0];
if (overlaps(glu, up_bias_data) || overlaps(glu, gate_bias_data)) {
return false;
}
}

return true;
}

Expand Down
Loading