Merge branch 'master' of https://github.com/piDack/llama.cpp into support_glm_edge_model

2024-12-26 03:14:35 +00:00 · 2024-11-08 03:53:05 +00:00 · 2024-11-08 03:53:05 +00:00 · a249dc0fbb
commit a249dc0fbb
parent 677058f470 76c6e7f105
39 changed files with 28922 additions and 1483 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -24,6 +24,16 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 [examples/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
 [examples/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
--- a/17
+++ b/17
@ -1455,22 +1455,13 @@ llama-server: \
 	examples/server/server.cpp \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
 	examples/server/colorthemes.css.hpp \
 	examples/server/style.css.hpp \
 	examples/server/theme-beeninorder.css.hpp \
 	examples/server/theme-ketivah.css.hpp \
 	examples/server/theme-mangotango.css.hpp \
 	examples/server/theme-playground.css.hpp \
 	examples/server/theme-polarnight.css.hpp \
 	examples/server/theme-snowstorm.css.hpp \
 	examples/server/index.html.hpp \
 	examples/server/index-new.html.hpp \
 	examples/server/index.js.hpp \
 	examples/server/completion.js.hpp \
 	examples/server/system-prompts.js.hpp \
 	examples/server/prompt-formats.js.hpp \
 	examples/server/json-schema-to-grammar.mjs.hpp \
 	examples/server/loading.html.hpp \
 	examples/server/deps_daisyui.min.css.hpp \
 	examples/server/deps_markdown-it.js.hpp \
 	examples/server/deps_tailwindcss.js.hpp \
 	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -3748,10 +3748,7 @@ class JaisModel(Model):
        # Embeddings scale
        self.embeddings_scale = 1.0
        # note: For some JAIS flavors, output is tied to (same as) wte in original model
        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
            self.output_is_wte = True   # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
@ -3808,10 +3805,7 @@ class JaisModel(Model):
        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            tensors.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            assert not self.output_is_wte
            tensors.append((new_name, data_torch * self.width_scale))
        else:
            tensors.append((new_name, data_torch))
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -15,22 +15,13 @@ set(TARGET_SRCS
    httplib.h
 )
 set(PUBLIC_ASSETS
    colorthemes.css
    style.css
    theme-beeninorder.css
    theme-ketivah.css
    theme-mangotango.css
    theme-playground.css
    theme-polarnight.css
    theme-snowstorm.css
    index.html
    index-new.html
    index.js
    completion.js
    system-prompts.js
    prompt-formats.js
    json-schema-to-grammar.mjs
    loading.html
    deps_daisyui.min.css
    deps_markdown-it.js
    deps_tailwindcss.js
    deps_vue.esm-browser.js
 )
 foreach(asset ${PUBLIC_ASSETS})
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
 }
 ```
 ### Legacy completion web UI
 A new chat-based UI has replaced the old completion-based UI since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion UI, start the server with `--path ./examples/server/public_legacy`.
 For example:
 ```sh
 ./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
 ```
 ### Extending or building alternative Web Front End
 You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -1,7 +1,7 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
 import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
 const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 PUBLIC=$DIR/public
 echo "download js bundle files"
-curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
+
-echo >> $PUBLIC/index.js # add newline
+# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
 curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
 echo >> $PUBLIC/deps_tailwindcss.js # add newline
 curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
 curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
 echo >> $PUBLIC/deps_daisyui.min.css # add newline
 curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
 echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
 curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
 echo >> $PUBLIC/deps_markdown-it.js # add newline
 ls -lah $PUBLIC
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@ -1,12 +1,16 @@
 const paramDefaults = {
  stream: true,
  n_predict: 500,
  temperature: 0.2,
  stop: ["</s>"]
 };
 let generation_settings = null;
 // Error raised when the server answers a completion request with a failure.
 //
 //   message - human readable description (becomes `error.message`)
 //   name    - error category exposed as `error.name` (e.g. 'ServerError')
 //   data    - optional extra payload describing the failure; kept on `error.data`
 export class CompletionError extends Error {
  constructor(message, name, data) {
    super(message);
    this.name = name;
    // Previously accepted but dropped; keep it so callers can inspect the payload.
    this.data = data;
  }
 };
 // Completes the prompt as a generator. Recommended for most use cases.
 //
@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  const completionParams = { ...paramDefaults, ...params, prompt };
-  const response = await fetch(`${api_url}/completion`, {
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
    signal: controller.signal,
  });
  const status = response.status;
  if (status !== 200) {
    try {
      const body = await response.json();
      if (body && body.error && body.error.message) {
        throw new CompletionError(body.error.message, 'ServerError');
      }
    } catch (err) {
      throw new CompletionError(err.message, 'ServerError');
    }
  }
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
      for (const line of lines) {
        const match = regex.exec(line);
        if (match) {
-          result[match[1]] = match[2]
+          result[match[1]] = match[2];
          if (result.data === '[DONE]') {
            cont = false;
            break;
          }
          // since we know this is llama.cpp, let's just decode the json in data
          if (result.data) {
            result.data = JSON.parse(result.data);
--- a/examples/server/public/deps_daisyui.min.css
+++ b/examples/server/public/deps_daisyui.min.css
--- a/examples/server/public/deps_markdown-it.js
+++ b/examples/server/public/deps_markdown-it.js
--- a/examples/server/public/deps_tailwindcss.js
+++ b/examples/server/public/deps_tailwindcss.js
--- a/examples/server/public/deps_vue.esm-browser.js
+++ b/examples/server/public/deps_vue.esm-browser.js
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
--- a/examples/server/public_legacy/colorthemes.css
+++ b/examples/server/public_legacy/colorthemes.css
--- a/examples/server/public_legacy/completion.js
+++ b/examples/server/public_legacy/completion.js
@ -0,0 +1,209 @@
 // Default completion parameters. llama() spreads the caller's `params` over
 // these, so any key supplied by the caller overrides the value listed here.
 const paramDefaults = {
  stream: true,
  n_predict: 500,
  temperature: 0.2,
  stop: ["</s>"]
 };
 // Module-level cache of the server's generation settings. Filled by llama()
 // when a stop event carries `generation_settings`, or by llamaModelInfo().
 let generation_settings = null;
 // Completes the prompt as an async generator. Recommended for most use cases.
 //
 // Parameters:
 //   prompt - prompt text; always sent as `prompt` in the request body
 //   params - completion parameters, spread over `paramDefaults`;
 //            `params.api_key`, when set, is sent as a Bearer token
 //   config - optional `controller` (AbortController), `api_url` (base URL,
 //            trailing slashes stripped) and `endpoint` (defaults to '/completion')
 //
 // Yields the event object for every `data:` line, with `data` replaced by its
 // parsed JSON. Returns the concatenated `data.content` once the stream ends.
 // Throws Error('slot unavailable') when the server reports that condition.
 //
 // Example:
 //
 //    import { llama } from '/completion.js'
 //
 //    const request = llama("Tell me a joke", {n_predict: 800})
 //    for await (const chunk of request) {
 //      document.write(chunk.data.content)
 //    }
 //
 export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;
  const api_url = config.api_url?.replace(/\/+$/, '') || "";
  if (!controller) {
    controller = new AbortController();
  }
  const completionParams = { ...paramDefaults, ...params, prompt };
  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  // Matches one SSE "field: value" line. Deliberately not global: a /g regex
  // keeps `lastIndex` between exec() calls on different strings, which makes
  // the line that follows a successful match fail to parse.
  const regex = /^(\S+):\s(.*)$/;
  let content = "";
  let leftover = ""; // Buffer for partially read lines
  try {
    let cont = true;
    while (cont) {
      const result = await reader.read();
      if (result.done) {
        break;
      }
      // Add any leftover data to the current chunk of data.
      // `stream: true` keeps a multi-byte UTF-8 character that is split
      // across two network chunks from being decoded as garbage.
      const text = leftover + decoder.decode(result.value, { stream: true });
      // Check if the last character is a line break
      const endsWithLineBreak = text.endsWith('\n');
      // Split the text into lines
      let lines = text.split('\n');
      // If the text doesn't end with a line break, then the last line is incomplete
      // Store it in leftover to be added to the next chunk of data
      if (!endsWithLineBreak) {
        leftover = lines.pop();
      } else {
        leftover = ""; // Reset leftover if we have a line break at the end
      }
      // Parse all sse events and add them to result
      for (const line of lines) {
        const match = regex.exec(line);
        if (!match) {
          continue;
        }
        const [, field, value] = match;
        result[field] = value;
        if (field === 'data') {
          if (value === '[DONE]') {
            cont = false;
            break;
          }
          // since we know this is llama.cpp, let's just decode the json in data
          if (value) {
            result.data = JSON.parse(value);
            content += result.data.content;
            // yield
            yield result;
            // if we got a stop token from server, we will break here
            if (result.data.stop) {
              if (result.data.generation_settings) {
                generation_settings = result.data.generation_settings;
              }
              cont = false;
              break;
            }
          }
        } else if (field === 'error' && value) {
          let errorInfo = null;
          try {
            errorInfo = JSON.parse(value);
          } catch (parseError) {
            console.error(`llama.cpp error ${value}`);
          }
          if (errorInfo) {
            result.error = errorInfo;
            if (typeof errorInfo.message === 'string' && errorInfo.message.includes('slot unavailable')) {
              // Thrown outside the JSON.parse try/catch so that it really is
              // seen by upstream callers instead of being swallowed and logged.
              throw new Error('slot unavailable');
            }
            console.error(`llama.cpp error [${errorInfo.code} - ${errorInfo.type}]: ${errorInfo.message}`);
          }
        }
      }
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      console.error("llama error: ", e);
    }
    throw e;
  }
  finally {
    controller.abort();
  }
  return content;
 }
 // Call llama, return an event target that you can subscribe to
 //
 // Events dispatched (all CustomEvent, payload in `detail`):
 //   "message"             - the parsed data of each streamed chunk
 //   "generation_settings" - when a chunk carries `generation_settings`
 //   "timings"             - when a chunk carries `timings`
 //   "done"                - `{ content }` with the full text, after the stream ends
 //
 // Example:
 //
 //    import { llamaEventTarget } from '/completion.js'
 //
 //    const conn = llamaEventTarget(prompt)
 //    conn.addEventListener("message", (chunk) => {
 //      document.write(chunk.detail.content)
 //    })
 //
 export const llamaEventTarget = (prompt, params = {}, config = {}) => {
  const eventTarget = new EventTarget();
  (async () => {
    let content = "";
    for await (const chunk of llama(prompt, params, config)) {
      const data = chunk.data;
      // One guard for every access below; a chunk without data is skipped
      // instead of raising a TypeError on `chunk.data.generation_settings`.
      if (!data) {
        continue;
      }
      content += data.content;
      eventTarget.dispatchEvent(new CustomEvent("message", { detail: data }));
      if (data.generation_settings) {
        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: data.generation_settings }));
      }
      if (data.timings) {
        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: data.timings }));
      }
    }
    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
  })();
  return eventTarget;
 }
 // Call llama, return a promise that resolves to the completed text. This does not support streaming
 //
 // The promise rejects with whatever error llama() throws.
 //
 // Example:
 //
 //     llamaPromise(prompt).then((content) => {
 //       document.write(content)
 //     })
 //
 //     or
 //
 //     const content = await llamaPromise(prompt)
 //     document.write(content)
 //
 export const llamaPromise = async (prompt, params = {}, config = {}) => {
  // An async function already returns a promise that resolves with the return
  // value and rejects on throw, so no `new Promise(async ...)` wrapper is needed.
  let content = "";
  for await (const chunk of llama(prompt, params, config)) {
    content += chunk.data.content;
  }
  return content;
 };
 /**
 * (deprecated)
 *
 * Streams a completion for `params.prompt` and hands every chunk to `callback`.
 * `params` is forwarded as the completion parameters and `controller` is the
 * AbortController passed on to llama().
 */
 export const llamaComplete = async (params, controller, callback) => {
  const stream = llama(params.prompt, params, { controller });
  for await (const piece of stream) {
    callback(piece);
  }
 }
 // Get the model info from the server. This is useful for getting the context window and so on.
 // The value is cached in `generation_settings`; `/props` is only requested
 // while that cache is still empty. `config.api_url` is the optional base URL.
 export const llamaModelInfo = async (config = {}) => {
  if (generation_settings) {
    return generation_settings;
  }
  const baseUrl = config.api_url?.replace(/\/+$/, '') || "";
  const response = await fetch(`${baseUrl}/props`);
  const props = await response.json();
  generation_settings = props.default_generation_settings;
  return generation_settings;
 }
--- a/examples/server/public_legacy/favicon.ico
+++ b/examples/server/public_legacy/favicon.ico
--- a/examples/server/public_legacy/index-new.html
+++ b/examples/server/public_legacy/index-new.html
--- a/examples/server/public_legacy/index.html
+++ b/examples/server/public_legacy/index.html
--- a/examples/server/public_legacy/index.js
+++ b/examples/server/public_legacy/index.js
--- a/examples/server/public_legacy/json-schema-to-grammar.mjs
+++ b/examples/server/public_legacy/json-schema-to-grammar.mjs
--- a/examples/server/public_legacy/loading.html
+++ b/examples/server/public_legacy/loading.html
@ -0,0 +1,12 @@
 <!DOCTYPE html>
 <html>
    <head>
        <!-- Ask the browser to reload this page every 5 seconds. -->
        <meta http-equiv="refresh" content="5">
    </head>
    <body>
        <!-- Placeholder message displayed while the model is loading. -->
        <div id="loading">
            The model is loading. Please wait.<br/>
            The user interface will appear soon.
        </div>
    </body>
 </html>
--- a/examples/server/public_legacy/prompt-formats.js
+++ b/examples/server/public_legacy/prompt-formats.js
--- a/examples/server/public_legacy/style.css
+++ b/examples/server/public_legacy/style.css
--- a/examples/server/public_legacy/system-prompts.js
+++ b/examples/server/public_legacy/system-prompts.js
--- a/examples/server/public_legacy/theme-beeninorder.css
+++ b/examples/server/public_legacy/theme-beeninorder.css
--- a/examples/server/public_legacy/theme-ketivah.css
+++ b/examples/server/public_legacy/theme-ketivah.css
--- a/examples/server/public_legacy/theme-mangotango.css
+++ b/examples/server/public_legacy/theme-mangotango.css
--- a/examples/server/public_legacy/theme-playground.css
+++ b/examples/server/public_legacy/theme-playground.css
--- a/examples/server/public_legacy/theme-polarnight.css
+++ b/examples/server/public_legacy/theme-polarnight.css
--- a/examples/server/public_legacy/theme-snowstorm.css
+++ b/examples/server/public_legacy/theme-snowstorm.css
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -14,22 +14,13 @@
 #define MIMETYPE_JSON "application/json; charset=utf-8"
 // auto generated files (update with ./deps.sh)
 #include "colorthemes.css.hpp"
 #include "style.css.hpp"
 #include "theme-beeninorder.css.hpp"
 #include "theme-ketivah.css.hpp"
 #include "theme-mangotango.css.hpp"
 #include "theme-playground.css.hpp"
 #include "theme-polarnight.css.hpp"
 #include "theme-snowstorm.css.hpp"
 #include "index.html.hpp"
 #include "index-new.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
 #include "system-prompts.js.hpp"
 #include "prompt-formats.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
 #include "loading.html.hpp"
 #include "deps_daisyui.min.css.hpp"
 #include "deps_markdown-it.js.hpp"
 #include "deps_tailwindcss.js.hpp"
 #include "deps_vue.esm-browser.js.hpp"
 #include <atomic>
 #include <condition_variable>
@ -2285,16 +2276,6 @@ int main(int argc, char ** argv) {
    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
    svr->set_default_headers({{"Server", "llama.cpp"}});
    // CORS preflight
    svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
        // Access-Control-Allow-Origin is already set by middleware
        res.set_header("Access-Control-Allow-Credentials", "true");
        res.set_header("Access-Control-Allow-Methods",     "POST");
        res.set_header("Access-Control-Allow-Headers",     "*");
        return res.set_content("", "text/html"); // blank response, no data
    });
    svr->set_logger(log_server_request);
    auto res_error = [](httplib::Response & res, const json & error_data) {
@ -2407,6 +2388,14 @@ int main(int argc, char ** argv) {
    // register server middlewares
    svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        // If this is OPTIONS request, skip validation because browsers don't include Authorization header
        if (req.method == "OPTIONS") {
            res.set_header("Access-Control-Allow-Credentials", "true");
            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
            res.set_header("Access-Control-Allow-Headers",     "*");
            res.set_content("", "text/html"); // blank response, no data
            return httplib::Server::HandlerResponse::Handled; // skip further processing
        }
        if (!middleware_server_state(req, res)) {
            return httplib::Server::HandlerResponse::Handled;
        }
@ -3116,33 +3105,19 @@ int main(int argc, char ** argv) {
    // register static assets routes
    if (!params.public_path.empty()) {
        // Set the base directory for serving static files
-        svr->set_base_dir(params.public_path);
+        bool is_found = svr->set_mount_point("/", params.public_path);
-    }
+        if (!is_found) {
-
+            LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
-    if (!params.api_keys.empty()) {
+            return 1;
-        // for now, if API key is set, web UI is unusable
+        }
        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
        });
    } else {
        // using embedded static files
-        svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+        svr->Get("/",                        handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-        svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/completion.js",           handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/deps_daisyui.min.css",    handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
-        svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+        svr->Get("/deps_markdown-it.js",     handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
-
+        svr->Get("/deps_tailwindcss.js",     handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
-        // add new-ui files
+        svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
        svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
        svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
        svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
        svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
        svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
        svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
    }
    // register API routes
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@ -64,5 +64,5 @@ Feature: Security
      | localhost       | Access-Control-Allow-Origin      | localhost         |
      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
+      | web.mydomain.fr | Access-Control-Allow-Methods     | GET, POST         |
      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -218,12 +218,12 @@ include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
    include/ggml.h
    include/ggml-cpu.h
    include/ggml-alloc.h
    include/ggml-backend.h
    include/ggml-blas.h
    include/ggml-cann.h
    include/ggml-cuda.h
    include/ggml.h
    include/ggml-kompute.h
    include/ggml-metal.h
    include/ggml-rpc.h
--- a/grammars/README.md
+++ b/grammars/README.md
@ -124,7 +124,7 @@ You can use GBNF grammars:
 - In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
 - To convert to a grammar ahead of time:
    - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
-    - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
+    - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@ -114,46 +114,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # replace filenames:
    #
-    # CMakelists.txt          -> ggml/CMakeLists.txt
+    # CMakelists.txt       -> ggml/CMakeLists.txt
-    # src/CMakeLists.txt      -> ggml/src/CMakeLists.txt
+    # src/CMakeLists.txt   -> ggml/src/CMakeLists.txt
-    # cmake/FindSIMD.cmake    -> ggml/cmake/FindSIMD.cmake
+    # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
    #
-    # src/ggml.c              -> ggml/src/ggml.c
+    # src/ggml*.c          -> ggml/src/ggml*.c
-    # src/ggml-aarch64.c      -> ggml/src/ggml-aarch64.c
+    # src/ggml*.cpp        -> ggml/src/ggml*.cpp
-    # src/ggml-aarch64.h      -> ggml/src/ggml-aarch64.h
+    # src/ggml*.h          -> ggml/src/ggml*.h
-    # src/ggml-alloc.c        -> ggml/src/ggml-alloc.c
+    # src/ggml*.cu         -> ggml/src/ggml*.cu
-    # src/ggml-amx/*          -> ggml/src/ggml-amx/
+    # src/ggml*.m          -> ggml/src/ggml*.m
-    # src/ggml-amx.cpp        -> ggml/src/ggml-amx.cpp
+    # src/ggml-amx/*       -> ggml/src/ggml-amx/
-    # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
+    # src/ggml-cann/*      -> ggml/src/ggml-cann/
-    # src/ggml-backend.cpp    -> ggml/src/ggml-backend.cpp
+    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/
-    # src/ggml-cann/*         -> ggml/src/ggml-cann/
+    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/
-    # src/ggml-cann.cpp       -> ggml/src/ggml-cann.cpp
+    # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
    # src/ggml-common.h       -> ggml/src/ggml-common.h
    # src/ggml-cuda/*         -> ggml/src/ggml-cuda/
    # src/ggml-cuda.cu        -> ggml/src/ggml-cuda.cu
    # src/ggml-impl.h         -> ggml/src/ggml-impl.h
    # src/ggml-kompute.cpp    -> ggml/src/ggml-kompute.cpp
    # src/ggml-metal.m        -> ggml/src/ggml-metal.m
    # src/ggml-quants.c       -> ggml/src/ggml-quants.c
    # src/ggml-quants.h       -> ggml/src/ggml-quants.h
    # src/ggml-rpc.cpp        -> ggml/src/ggml-rpc.cpp
    # src/ggml-sycl/*         -> ggml/src/ggml-sycl/
    # src/ggml-sycl.cpp       -> ggml/src/ggml-sycl.cpp
    # src/ggml-vulkan.cpp     -> ggml/src/ggml-vulkan.cpp
    # src/vulkan-shaders/*    -> ggml/src/vulkan-shaders/
    #
-    # include/ggml.h         -> ggml/include/ggml.h
+    # include/ggml*.h -> ggml/include/ggml*.h
    # include/ggml-alloc.h   -> ggml/include/ggml-alloc.h
    # include/ggml-amx.h     -> ggml/include/ggml-amx.h
    # include/ggml-backend.h -> ggml/include/ggml-backend.h
    # include/ggml-blas.h    -> ggml/include/ggml-blas.h
    # include/ggml-cann.h    -> ggml/include/ggml-cann.h
    # include/ggml-cuda.h    -> ggml/include/ggml-cuda.h
    # include/ggml-kompute.h -> ggml/include/ggml-kompute.h
    # include/ggml-metal.h   -> ggml/include/ggml-metal.h
    # include/ggml-rpc.h     -> ggml/include/ggml-rpc.h
    # include/ggml-sycl.h    -> ggml/include/ggml-sycl.h
    # include/ggml-vulkan.h  -> ggml/include/ggml-vulkan.h
    #
    # tests/test-opt.cpp           -> tests/test-opt.cpp
    # tests/test-grad0.cpp         -> tests/test-grad0.cpp
@ -168,41 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
        -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
        -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@ -1 +1 @@
-a099cb514d6687e436a5a423d1fb0448be0feb20
+89952d649e0c5cabbb9ff8c4906f5a843a789fb2
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -4,43 +4,18 @@ cp -rpv ../ggml/CMakeLists.txt       ./ggml/CMakeLists.txt
 cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
 cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
-cp -rpv ../ggml/src/ggml.c              ./ggml/src/ggml.c
+cp -rpv ../ggml/src/ggml*.c          ./ggml/src/
-cp -rpv ../ggml/src/ggml-aarch64.c      ./ggml/src/ggml-aarch64.c
+cp -rpv ../ggml/src/ggml*.cpp        ./ggml/src/
-cp -rpv ../ggml/src/ggml-aarch64.h      ./ggml/src/ggml-aarch64.h
+cp -rpv ../ggml/src/ggml*.h          ./ggml/src/
-cp -rpv ../ggml/src/ggml-alloc.c        ./ggml/src/ggml-alloc.c
+cp -rpv ../ggml/src/ggml*.cu         ./ggml/src/
-cp -rpv ../ggml/src/ggml-amx/*          ./ggml/src/ggml-amx/
+cp -rpv ../ggml/src/ggml*.m          ./ggml/src/
-cp -rpv ../ggml/src/ggml-amx.cpp        ./ggml/src/ggml-amx.cpp
+cp -rpv ../ggml/src/ggml-amx/*       ./ggml/src/ggml-amx/
-cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
+cp -rpv ../ggml/src/ggml-cann/*      ./ggml/src/ggml-cann/
-cp -rpv ../ggml/src/ggml-backend.cpp    ./ggml/src/ggml-backend.cpp
+cp -rpv ../ggml/src/ggml-cuda/*      ./ggml/src/ggml-cuda/
-cp -rpv ../ggml/src/ggml-cann/*         ./ggml/src/ggml-cann/
+cp -rpv ../ggml/src/ggml-sycl/*      ./ggml/src/ggml-sycl/
-cp -rpv ../ggml/src/ggml-cann.cpp       ./ggml/src/ggml-cann.cpp
+cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
 cp -rpv ../ggml/src/ggml-common.h       ./ggml/src/ggml-common.h
 cp -rpv ../ggml/src/ggml-cuda/*         ./ggml/src/ggml-cuda/
 cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml/src/ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-impl.h         ./ggml/src/ggml-impl.h
 cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml/src/ggml-kompute.cpp
 cp -rpv ../ggml/src/ggml-metal.m        ./ggml/src/ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal    ./ggml/src/ggml-metal.metal
 cp -rpv ../ggml/src/ggml-quants.c       ./ggml/src/ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h       ./ggml/src/ggml-quants.h
 cp -rpv ../ggml/src/ggml-rpc.cpp        ./ggml/src/ggml-rpc.cpp
 cp -rpv ../ggml/src/ggml-sycl/*         ./ggml/src/ggml-sycl/
 cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml/src/ggml-sycl.cpp
 cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml/src/ggml-vulkan.cpp
 cp -rpv ../ggml/src/vulkan-shaders/*    ./ggml/src/vulkan-shaders/
-cp -rpv ../ggml/include/ggml.h         ./ggml/include/ggml.h
+cp -rpv ../ggml/include/ggml*.h ./ggml/include/
 cp -rpv ../ggml/include/ggml-alloc.h   ./ggml/include/ggml-alloc.h
 cp -rpv ../ggml/include/ggml-amx.h     ./ggml/include/ggml-amx.h
 cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
 cp -rpv ../ggml/include/ggml-blas.h    ./ggml/include/ggml-blas.h
 cp -rpv ../ggml/include/ggml-cann.h    ./ggml/include/ggml-cann.h
 cp -rpv ../ggml/include/ggml-cuda.h    ./ggml/include/ggml-cuda.h
 cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
 cp -rpv ../ggml/include/ggml-metal.h   ./ggml/include/ggml-metal.h
 cp -rpv ../ggml/include/ggml-rpc.h     ./ggml/include/ggml-rpc.h
 cp -rpv ../ggml/include/ggml-sycl.h    ./ggml/include/ggml-sycl.h
 cp -rpv ../ggml/include/ggml-vulkan.h  ./ggml/include/ggml-vulkan.h
 cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp         ./tests/test-grad0.cpp
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@ -1876,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (llama_sampler_dry *) smpl->ctx;
-    // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
+    llama_vocab dummy_vocab;
-    auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
    auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
    // Copy the state, including the processed breakers
    {
        auto * result_ctx = (llama_sampler_dry *) result->ctx;
--- a/tests/run-json-schema-to-grammar.mjs
+++ b/tests/run-json-schema-to-grammar.mjs
@ -1,5 +1,5 @@
 import { readFileSync } from "fs"
-import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
+import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
 const [, , file] = process.argv
 const url = `file://${file}`
`@ -1 +1 @@`
	`a099cb514d6687e436a5a423d1fb0448be0feb20`	`89952d649e0c5cabbb9ff8c4906f5a843a789fb2`