@@ -13463,7 +13463,7 @@ struct llm_build_context {
                 0);
             cb(kv_cache_trans, "kv_cache_trans", il);
 
-            q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+            // q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
             q_pe = ggml_rope_ext(
                 ctx0, q_pe, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13472,7 +13472,7 @@ struct llm_build_context {
             cb(q_pe, "q_pe", il);
 
             // shared RoPE key
-            k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+            // k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
             k_pe = ggml_rope_ext(
                 ctx0, k_pe, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13508,15 +13508,17 @@ struct llm_build_context {
             struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
             cb(kq_nope, "kq_nope", il);
 
-            struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
-            cb(q_pe_perm, "q_pe_perm", il);
+            // Huh? This is not used anywhere
+            //struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
+            //cb(q_pe_perm, "q_pe_perm", il);
 
             struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
             cb(kq_pe, "kq_pe", il);
 
             struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
             cb(kq, "kq", il);
 
+            // We need this copy because soft_max expects a contiguous tensor
             kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
             cb(kq, "kq_perm", il);
 
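For context on the "// We need this copy because soft_max expects a contiguous tensor" line added above: ggml_permute only adjusts strides and returns a view, so the permuted kq has to be materialized with ggml_cont before being handed to ops that want contiguous input. Below is a minimal standalone sketch of that behaviour against the public ggml C API; the tensor shape is made up for illustration and is not the one used in llm_build_context.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // small scratch context; the size is arbitrary for this illustration
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a 4D tensor standing in for kq (illustrative shape only)
    struct ggml_tensor * kq = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 8, 4, 1);

    // ggml_permute swaps dimensions by rewriting the strides: no data is moved,
    // so the result is a non-contiguous view of the same buffer
    struct ggml_tensor * kq_perm = ggml_permute(ctx, kq, 0, 2, 1, 3);
    printf("permuted contiguous: %d\n", ggml_is_contiguous(kq_perm)); // prints 0

    // ggml_cont adds a copy node whose output tensor has contiguous strides;
    // its data is produced when the graph is evaluated, which is the form
    // the PR comment says soft_max expects
    struct ggml_tensor * kq_cont = ggml_cont(ctx, kq_perm);
    printf("after ggml_cont:     %d\n", ggml_is_contiguous(kq_cont)); // prints 1

    ggml_free(ctx);
    return 0;
}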