@@ -385,6 +385,8 @@ void llama_context::reserve() {
 
     synchronize();
 
+    const int64_t t_start_us = ggml_time_us();
+
     const uint32_t n_seqs   = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
@@ -528,6 +530,10 @@ void llama_context::reserve() {
     } else {
         LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
     }
+
+    const int64_t t_end_us = ggml_time_us();
+
+    LLAMA_LOG_INFO("%s: reserve took %.2f ms\n", __func__, (t_end_us - t_start_us)/1000.0);
 }
 
 void llama_context::synchronize() {
@@ -983,6 +989,10 @@ void llama_context::set_embeddings(bool value) {
 void llama_context::set_causal_attn(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
+    if (cparams.causal_attn == value) {
+        return;
+    }
+
     cparams.causal_attn = value;
 
     reserve();
@@ -1035,6 +1045,12 @@ void llama_context::set_adapter_lora(
         float scale) {
     LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
 
+    if (auto it = loras.find(adapter); it != loras.end()) {
+        if (it->second == scale) {
+            return;
+        }
+    }
+
     loras[adapter] = scale;
 
     reserve();
@@ -1044,9 +1060,9 @@ bool llama_context::rm_adapter_lora(
         llama_adapter_lora * adapter) {
     LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
 
-    auto pos = loras.find(adapter);
-    if (pos != loras.end()) {
-        loras.erase(pos);
+    auto it = loras.find(adapter);
+    if (it != loras.end()) {
+        loras.erase(it);
 
         reserve();
 
@@ -1059,6 +1075,10 @@ bool llama_context::rm_adapter_lora(
 void llama_context::clear_adapter_lora() {
     LLAMA_LOG_DEBUG("%s: call\n", __func__);
 
+    if (loras.empty()) {
+        return;
+    }
+
     loras.clear();
 
     reserve();
@@ -1072,13 +1092,9 @@ bool llama_context::apply_adapter_cvec(
         int32_t il_end) {
     LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
 
-    bool res = cvec.apply(model, data, len, n_embd, il_start, il_end);
+    // TODO: should we reserve?
 
-    if (res) {
-        reserve();
-    }
-
-    return res;
+    return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
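A minimal, self-contained sketch of the pattern the hunks above apply (the `context` type, member names, and fake timing below are hypothetical stand-ins, not the actual llama.cpp classes): each setter returns early when the requested state already matches the current one, so the expensive reserve() only runs on real state changes, and its duration is logged the way the first hunk does.

// Hypothetical, standalone illustration of the early-return-before-reserve pattern.
#include <chrono>
#include <cstdio>
#include <map>
#include <thread>

struct context {
    bool causal_attn = true;
    std::map<int, float> loras; // adapter handle -> scale (stand-in for llama_adapter_lora *)

    void reserve() {
        const auto t_start = std::chrono::steady_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(5)); // stand-in for graph reservation work
        const auto t_end = std::chrono::steady_clock::now();
        std::printf("%s: reserve took %.2f ms\n", __func__,
                    std::chrono::duration<double, std::milli>(t_end - t_start).count());
    }

    void set_causal_attn(bool value) {
        if (causal_attn == value) {
            return; // unchanged - skip the costly reserve
        }
        causal_attn = value;
        reserve();
    }

    void set_adapter_lora(int adapter, float scale) {
        if (auto it = loras.find(adapter); it != loras.end() && it->second == scale) {
            return; // same adapter at the same scale - nothing to do
        }
        loras[adapter] = scale;
        reserve();
    }
};

int main() {
    context ctx;
    ctx.set_causal_attn(true);     // no-op: already causal
    ctx.set_causal_attn(false);    // state change -> reserve()
    ctx.set_adapter_lora(1, 0.5f); // new adapter -> reserve()
    ctx.set_adapter_lora(1, 0.5f); // no-op: unchanged
    return 0;
}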