mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
server : allow to specify tokens as strings in logit_bias (#5003)
* server: allow to specify tokens as strings in logit_bias * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
85910c5b30
commit
684780141a
@ -185,7 +185,7 @@ node index.js
|
|||||||
|
|
||||||
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
||||||
|
|
||||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
|
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token `Hello`, or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]`, ensures that the token `Hello` is never produced. Tokens can also be given as strings: for example, `"logit_bias": [["Hello, World!",-0.5]]` reduces the likelihood of every individual token that the string `Hello, World!` tokenizes to, just like `presence_penalty` does (default: `[]`).
|
||||||
|
|
||||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||||
|
|
||||||
|
@ -626,18 +626,36 @@ struct llama_server_context
|
|||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
for (const auto &el : *logit_bias)
|
for (const auto &el : *logit_bias)
|
||||||
{
|
{
|
||||||
if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
if (el.is_array() && el.size() == 2)
|
||||||
|
{
|
||||||
|
float bias;
|
||||||
|
if (el[1].is_number())
|
||||||
|
{
|
||||||
|
bias = el[1].get<float>();
|
||||||
|
}
|
||||||
|
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||||
|
{
|
||||||
|
bias = -INFINITY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (el[0].is_number_integer())
|
||||||
{
|
{
|
||||||
llama_token tok = el[0].get<llama_token>();
|
llama_token tok = el[0].get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
if (el[1].is_number())
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
{
|
|
||||||
slot->sparams.logit_bias[tok] = el[1].get<float>();
|
|
||||||
}
|
}
|
||||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
}
|
||||||
|
else if (el[0].is_string())
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = -INFINITY;
|
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||||
|
for (auto tok : toks)
|
||||||
|
{
|
||||||
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user