From bc5ba007b2c83ac95875e68724dabfc12159fc61 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 25 Oct 2024 10:13:46 +0300 Subject: [PATCH] server : check that the prompt fits in the slot's context (#10030) ggml-ci --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + examples/server/server.cpp | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7e552a71b..a34dabe23 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -573,6 +573,9 @@ class Model: if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 022354a3b..28cd02e5a 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -72,6 +72,7 @@ models = [ {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 58f93694f..2821877b2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1882,12 +1882,17 @@ struct server_context { } if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { - // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_ubatch) { slot.release(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); continue; } + + if (slot.n_prompt_tokens > slot.n_ctx) { + slot.release(); + send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER); + continue; + } } else { if (!params.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size