diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d992feeef..ee0ababb1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -405,6 +405,7 @@ struct llama_server_context // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); embd = prompt_tokens; + if (n_past == num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. @@ -412,6 +413,9 @@ struct llama_server_context n_past--; } + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + LOG_VERBOSE("prompt ingested", { {"n_past", n_past}, {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, @@ -461,9 +465,6 @@ struct llama_server_context // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - embd = prompt_tokens; if (n_past == num_prompt_tokens) { @@ -471,6 +472,9 @@ struct llama_server_context n_past--; } + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + LOG_VERBOSE("prompt ingested", { {"n_past", n_past}, {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},