server : fix crash when prompt exceeds context size (#3996)

2024-12-24 18:34:36 +00:00 · 2023-11-11 05:48:21 +00:00 · 2023-11-11 05:48:21 +00:00 · d96ca7ded7
commit d96ca7ded7
parent 34b0a08207
1 changed file with 29 additions and 29 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1557,15 +1557,6 @@ struct llama_server_context

                    slot.num_prompt_tokens = prompt_tokens.size();

-                    if (!slot.params.cache_prompt)
-                    {
-                        llama_sampling_reset(slot.ctx_sampling);
-
-                        slot.n_past = 0;
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
-                    }
-                    else
-                    {
                    if (slot.params.n_keep < 0)
                    {
                        slot.params.n_keep = slot.num_prompt_tokens;
@@ -1595,6 +1586,15 @@ struct llama_server_context
                        GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
                    }

+                    if (!slot.params.cache_prompt)
+                    {
+                        llama_sampling_reset(slot.ctx_sampling);
+
+                        slot.n_past = 0;
+                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                    }
+                    else
+                    {
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {