Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-24 10:24:35 +00:00)
server : fix crash when prompt exceeds context size (#3996)
commit d96ca7ded7
parent 34b0a08207
@@ -1557,15 +1557,6 @@ struct llama_server_context
 
             slot.num_prompt_tokens = prompt_tokens.size();
 
-            if (!slot.params.cache_prompt)
-            {
-                llama_sampling_reset(slot.ctx_sampling);
-
-                slot.n_past = 0;
-                slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
-            }
-            else
-            {
                 if (slot.params.n_keep < 0)
                 {
                     slot.params.n_keep = slot.num_prompt_tokens;
@@ -1595,6 +1586,15 @@ struct llama_server_context
                     GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
                 }
 
+            if (!slot.params.cache_prompt)
+            {
+                llama_sampling_reset(slot.ctx_sampling);
+
+                slot.n_past = 0;
+                slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+            }
+            else
+            {
                 // push the prompt into the sampling context (do not apply grammar)
                 for (auto &token : prompt_tokens)
                 {