mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 10:24:35 +00:00
server : fix kv cache management (#3588)
This commit is contained in:
parent
b8fe4b5cc9
commit
57dd55e2c7
@ -405,6 +405,7 @@ struct llama_server_context
|
||||
// compare the evaluated prompt with the new prompt
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
embd = prompt_tokens;
|
||||
|
||||
if (n_past == num_prompt_tokens)
|
||||
{
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
@ -412,6 +413,9 @@ struct llama_server_context
|
||||
n_past--;
|
||||
}
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", n_past},
|
||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||
@ -461,9 +465,6 @@ struct llama_server_context
|
||||
// compare the evaluated prompt with the new prompt
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
embd = prompt_tokens;
|
||||
if (n_past == num_prompt_tokens)
|
||||
{
|
||||
@ -471,6 +472,9 @@ struct llama_server_context
|
||||
n_past--;
|
||||
}
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", n_past},
|
||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||
|
Loading…
Reference in New Issue
Block a user