server : completion requests remember slot_id

2024-12-27 20:04:35 +00:00 · 2023-10-22 19:34:48 +03:00 · 2023-10-22 19:34:48 +03:00 · a8063171bd
commit a8063171bd
parent f305d6434f
3 changed files with 1680 additions and 1673 deletions
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -413,7 +413,7 @@
          currentMessages.push(data);
          slot_id = data.slot_id;
          if (selected_image && !data.multimodal) {
-            alert("The server was no compiled for multimodal or the model projector can't be loaded.");
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
            return;
          }
          transcriptUpdate([...history, [char, currentMessages]])
@@ -470,6 +470,7 @@
      transcriptUpdate([...session.value.transcript, ["", prompt]]);
      await runLlama(prompt, {
        ...params.value,
+        slot_id: slot_id,
        stop: [],
      }, "");
    }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -655,6 +655,7 @@ struct llama_server_context
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
        llama_sampling_params default_sparams;
        slot->params.stream           = json_value(data, "stream",            false);
        slot->params.cache_prompt     = json_value(data, "cache_prompt",      false);
        slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
@@ -1141,10 +1142,10 @@ struct llama_server_context
        res.stop = false;
        res.result_json = json
        {
-            {"content",    tkn.text_to_send },
+            {"content",    tkn.text_to_send},
            {"stop",       false},
-            {"slot_id",    slot.id },
+            {"slot_id",    slot.id},
-            {"multimodal", multimodal }
+            {"multimodal", multimodal}
        };
        if (slot.sparams.n_probs > 0)
        {
@@ -1351,7 +1352,7 @@ struct llama_server_context
            switch (task.type)
            {
                case COMPLETION_TASK: {
-                    llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
+                    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                    if (slot == nullptr)
                    {
                        LOG_TEE("slot unavailable\n");
@@ -1515,7 +1516,9 @@ struct llama_server_context
                        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
                        prefix_tokens.push_back(llama_token_middle(ctx));
                        prompt_tokens = prefix_tokens;
-                    } else {
+                    }
+                    else
+                    {
                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty());  // add BOS if there isn't system prompt
                    }
@@ -2069,7 +2072,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 static json format_partial_response(
-    llama_server_context &llama, llama_client_slot* slot, const std::string &content, const std::vector<completion_token_output> &probs
+    llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
    json res = json
    {