server : fix multibyte handle in partial response (#3706)

This commit is contained in:
Jhen-Jie Hong 2023-10-21 19:58:03 +08:00 committed by GitHub
parent 778c070d1b
commit 17b23eb9cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1005,32 +1005,6 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.sent_count, slot.generated_text.size());
} else {
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict
if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.sent_count += result.text_to_send.size();
// add the token to slot queue and cache
}
slot.add_token_string(result);
if(slot.params.stream) {
send_partial_response(slot, result);
}
if (slot.multibyte_pending > 0)
{
slot.multibyte_pending -= token_str.size();
@@ -1059,6 +1033,36 @@ struct llama_server_context
}
}
if (slot.multibyte_pending == 0)
{
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.sent_count, slot.generated_text.size());
} else {
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict
if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.sent_count += result.text_to_send.size();
// add the token to slot queue and cache
}
slot.add_token_string(result);
if (slot.params.stream) {
send_partial_response(slot, result);
}
}
if (slot.multibyte_pending > 0 && !slot.has_next_token)
{
slot.has_next_token = true;