server : parallel decoding and multimodal (#3677)

* implementing parallel decoding in server example * crash fixed * save dev progress * refactored sampling function * completion endpoint working * multiple client support * grammar + no stream completion * cached prompt support * chat.mjs support cached prompt + some fixes * server ui now support multiple clients * unused change reverted * fixed timings per slot * add context swap * add changes to README.md * llava multimodal integration * fixed tokens probs * add multimodal input - alfa * refactor code + remove unused comments + improved README.md * fix compilation errors with llvm * notify the user from server ui that multimodality is unavialable * some ci fixes * fix ci make build undefined ref errors * fix long prompt than ctx proposed in #3639 * fixed premature end due stop word * context shift fixed * fix llava implementation * sync README.md changes * readme change * update api like OpenAI * multimodal support enabled by default * fix make bui;d errors * fix multiple clients * fix zig build * new sampling API * latest changes of sampling API * server : coding-style normalization * server : coding-style normalization (part 2) * server : remove beam-search functionality * server : bug fix in ingest_images n_tokens is incremented internally by llama_batch_add * server : use refs + use llama_batch_clear() * server : snake case * server : minor sync * added thread safe pipeline * server : bach has to be allocated for n_parallel sequences * server : no need for atomic int - already using mutex * server : logs + minor code style * server : fix multibyte handle in partial response (#3706) * fix image load + view image in chat * make : silence stb warnings * clip : link to ggml, not to llama * server : fix switch fallthrough * server : fix crash in Debug on macOS (I have no idea why this fixes it!?) * server : refactor ctx_sampling init + n_ctx + names * server : bug fix for prompt caching * Do not save/load image_data to localStorage * editorconfig : new line in index.html * server : completion requests remember slot_id * Update readme to document multimodal in server * server : minor style * Update readme to document multimodal in server * server : hide ctx_sampling->prev behind API (#3696) * server : apply fix from #3722 * server : fix slot reuse * server : add comment about changing slot_state to bool --------- Co-authored-by: FSSRepo <go778sgt@gmail.com> Co-authored-by: Damian Stewart <d@damianstewart.com> Co-authored-by: Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Co-authored-by: Jhen-Jie Hong <iainst0409@gmail.com> Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
2024-12-25 02:44:36 +00:00 · 2023-10-22 22:53:08 +03:00 · 2023-10-22 22:53:08 +03:00 · 438c2ca830
commit 438c2ca830
parent 9e70cc0322
12 changed files with 3980 additions and 2950 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
 *.bat
 *.metallib
 .DS_Store
 .build/
--- a/4
+++ b/4
@ -605,8 +605,8 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/build.zig
+++ b/build.zig
@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -1,7 +1,7 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -610,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
        int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
        for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
+            new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
        }
        if (verbosity >= 2) {
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -24,6 +24,10 @@ Command line options:
 -   `--port`: Set the port to listen. Default: `8080`.
 -   `--path`: path from which to serve static files (default examples/server/public)
 -   `--embedding`: Enable embedding extraction, Default: disabled.
 -   `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
 -   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 -   `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 -   `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 ## Build
@ -158,6 +162,8 @@ node index.js
    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
    *Result JSON:*
    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@ -188,6 +194,12 @@ node index.js
    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 -   **POST** `/tokenize`: Tokenize a given text.
    *Options:*
@ -218,8 +230,32 @@ node index.js
    It also accepts all the options of `/completion` except `stream` and `prompt`.
 -   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 ## More examples
 ### Change system prompt on runtime
 To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
 `prompt`: Specify a context that you want all connecting clients to respect.
 `anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
 `assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
 ```json
 {
    "system_prompt": {
        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
        "anti_prompt": "User:",
        "assistant_name": "Assistant:"
    }
 }
 ```
 **NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
 ### Interactive mode
 Check the sample in [chat.mjs](chat.mjs).
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@ -8,6 +8,7 @@ import json
 app = Flask(__name__)
 slot_id = -1
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "stop")): postData["stop"] += body["stop"]
    postData["n_keep"] = -1
    postData["stream"] = stream
-
+    postData["cache_prompt"] = True
    postData["slot_id"] = slot_id
    return postData
 def make_resData(data, chat=False, promptToken=[]):
@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
    slot_id = data["slot_id"]
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema"
 );
 const no_cached_prompt = args.find(
    (_, index) => args[index - 1] === "--no-cache-prompt"
 ) ?? "false";
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
 // Example usage: function,arguments
@ -30,6 +35,9 @@ if (grammarFile) {
    grammar = readFileSync(grammarFile, 'utf-8')
 }
 // for cached prompt
 let slot_id = -1;
 const API_URL = 'http://127.0.0.1:8080'
 const chat = [
@ -76,6 +84,8 @@ async function chat_completion(question) {
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            cache_prompt: no_cached_prompt === "false",
            slot_id: slot_id,
            stop: ["\n### Human:"], // stop completion after generating this
            grammar,
            stream: true,
@ -92,6 +102,7 @@ async function chat_completion(question) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
            slot_id = message.slot_id
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -125,6 +125,7 @@
      background-color: #222;
      color: #ddd;
    }
    code {
      font-family: monospace;
      padding: 0.1em 0.3em;
@ -141,7 +142,8 @@
      display: inline;
    }
-    header, footer {
+    header,
    footer {
      text-align: center;
    }
@ -163,6 +165,7 @@
      0% {
        background-position: 0%;
      }
      100% {
        background-position: 100%;
      }
@ -181,6 +184,7 @@
        --loading-color-1: #22222200;
        --loading-color-2: #222222ff;
      }
      .popover-content {
        background-color: black;
      }
@ -194,6 +198,8 @@
    import { llama } from '/completion.js';
    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
    let selected_image = false;
    var slot_id = -1;
    const session = signal({
      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@ -203,6 +209,7 @@
      type: "chat",  // "chat" | "completion"
      char: "Llama",
      user: "User",
      image_selected: ''
    })
    const params = signal({
@ -220,7 +227,9 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
-      n_probs: 0, // no completion_probabilities
+      n_probs: 0, // no completion_probabilities,
      image_data: [],
      cache_prompt: true
    })
    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@ -270,6 +279,7 @@
      // saved templates were successfuly imported.
      console.log('Processing saved templates and updating default template')
      params.value = { ...params.value, image_data: [] };
      //console.log(importedTemplates);
      savedUserTemplates.value = importedTemplates;
@ -294,7 +304,9 @@
    function userTemplateApply(t) {
      session.value = t.data.session;
      session.value = { ...session.value, image_selected: '' };
      params.value = t.data.params;
      params.value = { ...params.value, image_data: [] };
    }
    function userTemplateResetToDefaultAndApply() {
@ -399,6 +411,11 @@
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
          slot_id = data.slot_id;
          if (selected_image && !data.multimodal) {
            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
            return;
          }
          transcriptUpdate([...history, [char, currentMessages]])
        }
@ -419,7 +436,7 @@
      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
-      const prompt = template(session.value.template, {
+      let prompt = template(session.value.template, {
        message: msg,
        history: session.value.transcript.flatMap(
          ([name, data]) =>
@ -434,9 +451,12 @@
            )
        ).join("\n"),
      });
-
+      if (selected_image) {
        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
      }
      await runLlama(prompt, {
        ...params.value,
        slot_id: slot_id,
        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
      }, "{{char}}");
    }
@ -450,6 +470,7 @@
      transcriptUpdate([...session.value.transcript, ["", prompt]]);
      await runLlama(prompt, {
        ...params.value,
        slot_id: slot_id,
        stop: [],
      }, "");
    }
@ -467,6 +488,27 @@
      transcriptUpdate([]);
    }
    const uploadImage = (e) => {
      e.preventDefault();
      document.getElementById("fileInput").click();
      document.getElementById("fileInput").addEventListener("change", function (event) {
        const selectedFile = event.target.files[0];
        if (selectedFile) {
          const reader = new FileReader();
          reader.onload = function () {
            const image_data = reader.result;
            session.value = { ...session.value, image_selected: image_data };
            params.value = {
              ...params.value, image_data: [
                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
            }
          };
          selected_image = true;
          reader.readAsDataURL(selectedFile);
        }
      });
    }
    function MessageInput() {
      const message = useSignal("")
@ -497,6 +539,7 @@
          </div>
          <div class="right">
            <button type="submit" disabled=${generating.value}>Send</button>
            <button onclick=${uploadImage}>Upload Image</button>
            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
@ -549,6 +592,7 @@
      return html`
        <section id="chat" ref=${container}>
          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
          ${messages.flatMap(chatLine)}
        </section>`;
    };
@ -952,8 +996,11 @@
 </head>
 <body>
-  <div id="container"></div>
+  <div id="container">
    <input type="file" id="fileInput" accept="image/*" style="display: none;">
  </div>
  <div id="portal"></div>
 </body>
 </html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp