server : fix multibyte handling in partial response (#3706)

2025-01-13 20:14:29 +00:00 · 2023-10-21 19:58:03 +08:00 · 2023-10-21 19:58:03 +08:00 · 17b23eb9cb
commit 17b23eb9cb
parent 778c070d1b
1 changed files with 30 additions and 26 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1005,32 +1005,6 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;

-        size_t pos = std::min(slot.sent_count, slot.generated_text.size());
-        const std::string str_test = slot.generated_text.substr(pos);
-        bool is_stop_full = false;
-        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-        if (stop_pos != std::string::npos) {
-            is_stop_full = true;
-            slot.generated_text.erase(
-                slot.generated_text.begin() + pos + stop_pos,
-                slot.generated_text.end());
-            pos = std::min(slot.sent_count, slot.generated_text.size());
-        } else {
-            is_stop_full = false;
-            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-        }
-
-        // check if there is any token to predict
-        if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
-            // no send the stop word in the response
-            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-            slot.sent_count += result.text_to_send.size();
-            // add the token to slot queue and cache
-        }
-        slot.add_token_string(result);
-        if(slot.params.stream) {
-            send_partial_response(slot, result);
-        }
        if (slot.multibyte_pending > 0)
        {
            slot.multibyte_pending -= token_str.size();
@ -1059,6 +1033,36 @@ struct llama_server_context
            }
        }

+        if (slot.multibyte_pending == 0)
+        {
+            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+            const std::string str_test = slot.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+            if (stop_pos != std::string::npos) {
+                is_stop_full = true;
+                slot.generated_text.erase(
+                    slot.generated_text.begin() + pos + stop_pos,
+                    slot.generated_text.end());
+                pos = std::min(slot.sent_count, slot.generated_text.size());
+            } else {
+                is_stop_full = false;
+                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+            }
+
+            // check if there is any token to predict
+            if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+                // do not send the stop word in the response
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.sent_count += result.text_to_send.size();
+                // add the token to slot queue and cache
+            }
+            slot.add_token_string(result);
+            if (slot.params.stream) {
+                send_partial_response(slot, result);
+            }
+        }
+
        if (slot.multibyte_pending > 0 && !slot.has_next_token)
        {
            slot.has_next_token = true;