diff --git a/examples/server/README.md b/examples/server/README.md
index 257ef45c6..ecd24c899 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -438,6 +438,8 @@ These words will not be included in the completion, so make sure to add them to
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
+`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
 `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
@@ -451,7 +453,7 @@ These words will not be included in the completion, so make sure to add them to
 ```json
 {
     "content": "",
-    "tokens": [ generated token ids ],
+    "tokens": [ generated token ids if requested ],
     "probs": [
         {
             "prob": float,
@@ -469,7 +471,7 @@ These words will not be included in the completion, so make sure to add them to
 Notice that each `probs` is an array of length `n_probs`.
 
 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-- `tokens`: Same as `content` but represented as raw token ids.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
 - `model`: The path to the model loaded with `-m`
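Note on the README change above: a minimal sketch of a non-streaming request that exercises the new `return_tokens` field, written in Python with `requests`. The server address, prompt, and `n_predict` value are assumptions for illustration, not part of this diff:

```python
# Minimal sketch: non-streaming /completion with return_tokens enabled.
# Assumes a llama-server instance listening on localhost:8080 (an assumption).
import requests

res = requests.post("http://localhost:8080/completion", json={
    "prompt": "I believe the meaning of life is",
    "n_predict": 8,
    "return_tokens": True,  # default is False; without it "tokens" comes back as []
})
body = res.json()
print(body["content"])  # generated text
print(body["tokens"])   # raw token ids corresponding to "content"
```
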
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 34ddf0b2f..4675ac593 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -79,8 +79,9 @@ enum error_type {
 };
 
 struct slot_params {
-    bool stream       = true;
-    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+    bool stream        = true;
+    bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens = false;
 
     int32_t n_keep    =  0; // number of tokens to keep from initial prompt
     int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -199,6 +200,7 @@ struct server_task {
 
         params.stream        = json_value(data, "stream",        false);
         params.cache_prompt  = json_value(data, "cache_prompt",  true);
+        params.return_tokens = json_value(data, "return_tokens", false);
         params.n_predict     = json_value(data, "n_predict",     json_value(data, "max_tokens", defaults.n_predict));
         params.n_indent      = json_value(data, "n_indent",      defaults.n_indent);
         params.n_keep        = json_value(data, "n_keep",        defaults.n_keep);
@@ -543,7 +545,7 @@ struct server_task_result_cmpl_final : server_task_result {
         json choices = json::array({json{
             {"finish_reason", finish_reason},
             {"index", 0},
-            {"message", json{
+            {"message", json {
                 {"content", content},
                 {"tokens", tokens},
                 {"role", "assistant"}
@@ -998,7 +1000,6 @@ struct server_slot {
         n_prompt_tokens    = 0;
         last_nl_pos        = 0;
         generated_text     = "";
-        generated_tokens   = {};
         has_new_line       = false;
         truncated          = false;
         stop               = STOP_TYPE_NONE;
@@ -1008,6 +1009,7 @@ struct server_slot {
         n_sent_token_probs = 0;
         task_type          = SERVER_TASK_TYPE_COMPLETION;
 
+        generated_tokens.clear();
         generated_token_probs.clear();
     }
@@ -1748,9 +1750,10 @@ struct server_context {
         const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;
 
-        // search stop word and delete it
         slot.generated_text += token_str;
-        slot.generated_tokens.push_back(result.tok);
+        if (slot.params.return_tokens) {
+            slot.generated_tokens.push_back(result.tok);
+        }
         slot.has_next_token = true;
 
         // check if there is incomplete UTF-8 character at the end
@@ -1775,6 +1778,7 @@ struct server_context {
             break;
         }
 
+        // search stop word and delete it
         if (!incomplete) {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
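Note on the server change above: `generated_tokens` is only filled when the client asks for it, so a client that wants both text and ids should request them in one call. If the text is needed from the ids again later, they can in principle be round-tripped through the server's existing `/detokenize` endpoint. A hedged sketch (the base URL is an assumption, and the exact `/detokenize` response shape and whitespace behavior should be checked against the version in use):

```python
# Sketch: request text + ids once, then round-trip the ids through /detokenize.
import requests

BASE = "http://localhost:8080"  # assumed server address

res = requests.post(f"{BASE}/completion", json={
    "prompt": "Write a short greeting",
    "n_predict": 16,
    "return_tokens": True,
}).json()

detok = requests.post(f"{BASE}/detokenize", json={"tokens": res["tokens"]}).json()
# Expected to match res["content"] up to tokenizer whitespace handling.
print(res["content"])
print(detok["content"])
```
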
diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
index 062ebcd4a..39b5c8dbe 100644
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -10,16 +10,17 @@ def create_server():
     global server
     server = ServerPreset.tinyllama2()
 
-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
-    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
-    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
 ])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
     global server
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": n_predict,
         "prompt": prompt,
+        "return_tokens": return_tokens,
     })
     assert res.status_code == 200
     assert res.body["timings"]["prompt_n"] == n_prompt
@@ -27,6 +28,10 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
     assert res.body["truncated"] == truncated
     assert type(res.body["has_new_line"]) == bool
     assert match_regex(re_content, res.body["content"])
+    if return_tokens:
+        assert res.body["tokens"] != []
+    else:
+        assert res.body["tokens"] == []
 
 
 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@@ -56,6 +61,7 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
             assert data["generation_settings"]["seed"] == server.seed
             assert match_regex(re_content, content)
         else:
+            assert data["tokens"] != []
             content += data["content"]
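Note on the test change above: the streaming assertion `data["tokens"] != []` reflects the README wording that `tokens` is populated for every chunk when `"stream": true`, independent of `return_tokens`. A sketch of a client consuming that stream; the server address and the SSE-style `data: {json}` framing are assumptions based on the endpoint's documented behavior:

```python
# Sketch: consume the streamed /completion response; each "data: {json}" chunk
# carries "content" (text piece) and "tokens" (raw ids for that piece).
import json
import requests

text = ""
ids: list[int] = []
with requests.post("http://localhost:8080/completion",
                   json={"prompt": "Hello", "n_predict": 8, "stream": True},
                   stream=True) as res:
    for line in res.iter_lines():
        if not line.startswith(b"data: "):
            continue  # skip blank separator lines between events
        chunk = json.loads(line[len(b"data: "):])
        text += chunk.get("content", "")
        ids.extend(chunk.get("tokens", []))
        if chunk.get("stop"):
            break

print(text)
print(ids)
```
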