mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
server : fix kv cache management (#3588)
This commit is contained in:
parent
b8fe4b5cc9
commit
57dd55e2c7
@@ -405,6 +405,7 @@ struct llama_server_context
|
|||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
|
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
@@ -412,6 +413,9 @@ struct llama_server_context
|
|||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
@@ -461,9 +465,6 @@ struct llama_server_context
|
|||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
|
|
||||||
// since #3228 we now have to manually manage the KV cache
|
|
||||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
||||||
|
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
@@ -471,6 +472,9 @@ struct llama_server_context
|
|||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
|
Loading…
Reference in New Issue
Block a user