Skip to content

Commit 3aaf602

Browse files
author
Iwan Kawrakow
committed
Remove some unnecessary copies in the MLA attention
1 parent 37c4fbd commit 3aaf602

File tree

1 file changed

+6 -4 lines changed

1 file changed

+6 -4 lines changed

src/llama.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13463,7 +13463,7 @@ struct llm_build_context {
 0);
 cb(kv_cache_trans, "kv_cache_trans", il);
 
-q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+//q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
 q_pe = ggml_rope_ext(
 ctx0, q_pe, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13472,7 +13472,7 @@ struct llm_build_context {
 cb(q_pe, "q_pe", il);
 
 // shared RoPE key
-k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+//k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
 k_pe = ggml_rope_ext(
 ctx0, k_pe, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13508,15 +13508,17 @@ struct llm_build_context {
 struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
 cb(kq_nope, "kq_nope", il);
 
-struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
-cb(q_pe_perm, "q_pe_perm", il);
+// Huh? This is not used anywhere
+//struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
+//cb(q_pe_perm, "q_pe_perm", il);
 
 struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
 cb(kq_pe, "kq_pe", il);
 
 struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
 cb(kq, "kq", il);
 
+// We need this copy because soft_max expects a contiguous tensor
 kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
 cb(kq, "kq_perm", il);
 
0 commit comments

Comments
 (0)