Skip to content

Commit 4b74410

Browse files
committed
cont : reserve only when changes occur + timing
1 parent 55dd591 commit 4b74410

1 file changed

Lines changed: 25 additions & 9 deletions

File tree

src/llama-context.cpp

Lines changed: 25 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -385,6 +385,8 @@ void llama_context::reserve() {
385385

386386
synchronize();
387387

388+
const int64_t t_start_us = ggml_time_us();
389+
388390
const uint32_t n_seqs = cparams.n_seq_max;
389391
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
390392

@@ -528,6 +530,10 @@ void llama_context::reserve() {
528530
} else {
529531
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
530532
}
533+
534+
const int64_t t_end_us = ggml_time_us();
535+
536+
LLAMA_LOG_INFO("%s: reserve took %.2f ms\n", __func__, (t_end_us - t_start_us)/1000.0);
531537
}
532538

533539
void llama_context::synchronize() {
@@ -983,6 +989,10 @@ void llama_context::set_embeddings(bool value) {
983989
void llama_context::set_causal_attn(bool value) {
984990
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
985991

992+
if (cparams.causal_attn == value) {
993+
return;
994+
}
995+
986996
cparams.causal_attn = value;
987997

988998
reserve();
@@ -1035,6 +1045,12 @@ void llama_context::set_adapter_lora(
10351045
float scale) {
10361046
LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
10371047

1048+
if (auto it = loras.find(adapter); it != loras.end()) {
1049+
if (it->second == scale) {
1050+
return;
1051+
}
1052+
}
1053+
10381054
loras[adapter] = scale;
10391055

10401056
reserve();
@@ -1044,9 +1060,9 @@ bool llama_context::rm_adapter_lora(
10441060
llama_adapter_lora * adapter) {
10451061
LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
10461062

1047-
auto pos = loras.find(adapter);
1048-
if (pos != loras.end()) {
1049-
loras.erase(pos);
1063+
auto it = loras.find(adapter);
1064+
if (it != loras.end()) {
1065+
loras.erase(it);
10501066

10511067
reserve();
10521068

@@ -1059,6 +1075,10 @@ bool llama_context::rm_adapter_lora(
10591075
void llama_context::clear_adapter_lora() {
10601076
LLAMA_LOG_DEBUG("%s: call\n", __func__);
10611077

1078+
if (loras.empty()) {
1079+
return;
1080+
}
1081+
10621082
loras.clear();
10631083

10641084
reserve();
@@ -1072,13 +1092,9 @@ bool llama_context::apply_adapter_cvec(
10721092
int32_t il_end) {
10731093
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
10741094

1075-
bool res = cvec.apply(model, data, len, n_embd, il_start, il_end);
1095+
// TODO: should we reserve?
10761096

1077-
if (res) {
1078-
reserve();
1079-
}
1080-
1081-
return res;
1097+
return cvec.apply(model, data, len, n_embd, il_start, il_end);
10821098
}
10831099

10841100
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {

0 commit comments

Comments
 (0)