From 03fb8a002df2e96104f9e06de9c78d2a8ed91e92 Mon Sep 17 00:00:00 2001 From: maor-ps <154728172+maor-ps@users.noreply.github.com> Date: Sat, 4 May 2024 12:06:40 +0300 Subject: [PATCH] If the first token generated by the server is the stop word, the server will crash (#7038) The following request reproduces the issue with llama13b: { 'prompt': 'Q: hello world \nA: ', 'stop': ['\n'], 'temperature': 0.0, 'n_predict': 10, 'cache_prompt': True, 'n_probs': 10 } --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f60530cf3..ff0814b2f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1383,9 +1383,10 @@ struct server_context { if (!slot.params.stream && slot.stopped_word) { const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); + slot.generated_token_probs.end() - safe_offset); } else { probs = std::vector( slot.generated_token_probs.begin(),