Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-26 03:14:35 +00:00)
server : return tokens ids only if requested
Some checks failed
Python Type-Check / pyright type-check (push) Has been cancelled
ggml-ci
parent d58f8a1b6b
commit 8bcfc5551e

@@ -438,6 +438,8 @@ These words will not be included in the completion, so make sure to add them to
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`

+`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

 `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
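
For illustration, a request that sets the new flag might look like the sketch below. It is not part of the commit; it assumes a llama.cpp server already listening on http://localhost:8080 and uses only the parameters documented above.

```python
# Minimal sketch: POST /completion with the new `return_tokens` flag.
# Assumes a llama.cpp server is already running on localhost:8080.
import requests

resp = requests.post(
    "http://localhost:8080/completion",
    json={
        "prompt": "I believe the meaning of life is",
        "n_predict": 8,
        "return_tokens": True,  # ask for raw token ids in the `tokens` field
    },
)
resp.raise_for_status()
body = resp.json()

print(body["content"])  # generated text
print(body["tokens"])   # raw token ids; [] when return_tokens is left at false
```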

@@ -451,7 +453,7 @@ These words will not be included in the completion, so make sure to add them to
 ```json
 {
   "content": "<the token generated by the model>",
-  "tokens": [ generated token ids ],
+  "tokens": [ generated token ids if requested ],
   "probs": [
     {
       "prob": float,

@@ -469,7 +471,7 @@ These words will not be included in the completion, so make sure to add them to
 Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-- `tokens`: Same as `content` but represented as raw token ids.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
 - `model`: The path to the model loaded with `-m`
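
To make the field semantics concrete, the sketch below reads the documented response fields from a non-streaming request that leaves `return_tokens` at its default. It is not part of the commit; the host, port and prompt are placeholders.

```python
# Sketch: reading the documented /completion response fields.
# `return_tokens` is left at its default (false), so `tokens` should be empty.
import requests

body = requests.post(
    "http://localhost:8080/completion",          # assumed local server
    json={"prompt": "Hello", "n_predict": 4},    # no "return_tokens" in the request
).json()

assert body["tokens"] == []         # empty unless return_tokens or stream is set
print(body["content"])              # completion text
print(body["stop"])                 # True once generation has stopped
print(body["generation_settings"])  # effective options, including n_ctx and model
print(body["model"])                # path of the model loaded with -m
```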

@@ -81,6 +81,7 @@ enum error_type {
 struct slot_params {
     bool stream = true;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens = false;

     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half

@@ -199,6 +200,7 @@ struct server_task {

     params.stream = json_value(data, "stream", false);
     params.cache_prompt = json_value(data, "cache_prompt", true);
+    params.return_tokens = json_value(data, "return_tokens", false);
     params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
     params.n_indent = json_value(data, "n_indent", defaults.n_indent);
     params.n_keep = json_value(data, "n_keep", defaults.n_keep);

@@ -998,7 +1000,6 @@ struct server_slot {
     n_prompt_tokens = 0;
     last_nl_pos = 0;
     generated_text = "";
-    generated_tokens = {};
     has_new_line = false;
     truncated = false;
     stop = STOP_TYPE_NONE;

@@ -1008,6 +1009,7 @@ struct server_slot {
     n_sent_token_probs = 0;
     task_type = SERVER_TASK_TYPE_COMPLETION;

+    generated_tokens.clear();
     generated_token_probs.clear();
 }

@@ -1748,9 +1750,10 @@ struct server_context {
     const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
     slot.sampled = result.tok;

-    // search stop word and delete it
     slot.generated_text += token_str;
+    if (slot.params.return_tokens) {
         slot.generated_tokens.push_back(result.tok);
+    }
     slot.has_next_token = true;

     // check if there is incomplete UTF-8 character at the end

@@ -1775,6 +1778,7 @@ struct server_context {
         break;
     }

+    // search stop word and delete it
     if (!incomplete) {
         size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());

@@ -10,16 +10,17 @@ def create_server():
     global server
     server = ServerPreset.tinyllama2()

-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
-    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
-    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
 ])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
     global server
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": n_predict,
         "prompt": prompt,
+        "return_tokens": return_tokens,
     })
     assert res.status_code == 200
     assert res.body["timings"]["prompt_n"] == n_prompt

@@ -27,6 +28,10 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
     assert res.body["truncated"] == truncated
     assert type(res.body["has_new_line"]) == bool
     assert match_regex(re_content, res.body["content"])
+    if return_tokens:
+        assert res.body["tokens"] != []
+    else:
+        assert res.body["tokens"] == []


 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [

@@ -56,6 +61,7 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
             assert data["generation_settings"]["seed"] == server.seed
             assert match_regex(re_content, content)
         else:
+            assert data["tokens"] != []
             content += data["content"]
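
The streaming assertion above reflects the updated README wording: with `"stream": true` each chunk carries its token ids even when `return_tokens` is not set. A rough client-side sketch of that behaviour, assuming the server's usual SSE (`data: {...}`) framing on localhost:8080:

```python
# Sketch: streaming /completion; per the README change above, each streamed
# chunk is expected to include `tokens` even without "return_tokens": true.
# Host, port and prompt are assumptions for illustration only.
import json
import requests

with requests.post(
    "http://localhost:8080/completion",
    json={"prompt": "I believe the meaning of life is", "n_predict": 8, "stream": True},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["content"], chunk["tokens"])  # per-chunk text and token ids
        if chunk.get("stop"):
            break
```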