server : fix kv cache management (#3588)

Georgi Gerganov 2023-10-12 09:29:04 +03:00 committed by GitHub
parent b8fe4b5cc9
commit 57dd55e2c7


@@ -405,6 +405,7 @@ struct llama_server_context
     // compare the evaluated prompt with the new prompt
     n_past = common_part(embd, prompt_tokens);
     embd = prompt_tokens;
     if (n_past == num_prompt_tokens)
     {
         // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
         n_past--;
     }
+    // since #3228 we now have to manually manage the KV cache
+    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
     LOG_VERBOSE("prompt ingested", {
         {"n_past", n_past},
         {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
     // compare the evaluated prompt with the new prompt
     n_past = common_part(embd, prompt_tokens);
-    // since #3228 we now have to manually manage the KV cache
-    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
     embd = prompt_tokens;
     if (n_past == num_prompt_tokens)
     {
@@ -471,6 +472,9 @@ struct llama_server_context
         n_past--;
     }
+    // since #3228 we now have to manually manage the KV cache
+    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
     LOG_VERBOSE("prompt ingested", {
         {"n_past", n_past},
         {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},