From 55978ce09b69d3987d17d08d92d8cc27193e0773 Mon Sep 17 00:00:00 2001
From: Haohui Mai <ricetons@gmail.com>
Date: Thu, 23 Nov 2023 13:56:53 -0800
Subject: [PATCH 01/64] Fix incorrect format strings and uninitialized
 variables. (#4133)

* Fix incorrect format strings and uninitialized variables.

* Address comments

* Add the missing include statement
---
 examples/server/server.cpp | 2 ++
 ggml-cuda.cu               | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1f2c55f2d..be23ad169 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1095,6 +1095,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = id;
+        res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
@@ -1255,6 +1256,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
+        task.target_id = 0;
         task.data = data;
         task.infill_mode = infill;
         task.embedding_mode = embedding;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 50e03de50..f0db7ae35 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -8057,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); // PRId64 expands to the length modifier and conversion letter only, so the '%' must be written explicitly
 #endif
             return false;
         }

From b35f3d0def3efde92ed465d92a267430d957e87d Mon Sep 17 00:00:00 2001
From: Aaryaman Vasishta <aaryaman.vasishta@amd.com>
Date: Fri, 24 Nov 2023 16:52:39 +0900
Subject: [PATCH 02/64] readme : use PATH for Windows ROCm (#4195)

* Update README.md to use PATH for Windows ROCm

* Update README.md

* Update README.md
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 276461f81..5189e1255 100644
--- a/README.md
+++ b/README.md
@@ -422,8 +422,9 @@ Building the program with BLAS support may lead to some performance improvements
     CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
     cmake --build .
     ```
-  - Using `CMake` for Windows:
+  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
     ```bash
+    set PATH=%HIP_PATH%\bin;%PATH%
     mkdir build
     cd build
     cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..

From 2568a4bf548d7392e9c78c008b33b4c11d53fe95 Mon Sep 17 00:00:00 2001
From: eastriver <lee@eastriver.dev>
Date: Fri, 24 Nov 2023 18:25:10 +0900
Subject: [PATCH 03/64] main.swift : fix eos checking (#4197)

llama_token_eos(const struct llama_model *) is currently getting struct llama_context type variable context as a parameter.
---
 examples/batched.swift/Sources/main.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 772730382..ba15197ae 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
         // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
         // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(context) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {

From 189d68446e7ef21e8f3af3c0a3d91c35a39aec89 Mon Sep 17 00:00:00 2001
From: Galunid <karolek1231456@gmail.com>
Date: Fri, 24 Nov 2023 15:02:49 +0100
Subject: [PATCH 04/64] convert : fix tensors using grad in some models (#4173)

---
 convert-hf-to-gguf.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 1105670c1..147d5717e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -880,20 +880,21 @@ print(f"Loading model: {dir_model.name}")
 
 hparams = Model.load_hparams(dir_model)
 
-model_class = Model.from_model_architecture(hparams["architectures"][0])
-model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+with torch.inference_mode():
+    model_class = Model.from_model_architecture(hparams["architectures"][0])
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
 
-print("Set model parameters")
-model_instance.set_gguf_parameters()
+    print("Set model parameters")
+    model_instance.set_gguf_parameters()
 
-print("Set model tokenizer")
-model_instance.set_vocab()
+    print("Set model tokenizer")
+    model_instance.set_vocab()
 
-if args.vocab_only:
-    print(f"Exporting model vocab to '{fname_out}'")
-    model_instance.write_vocab()
-else:
-    print(f"Exporting model to '{fname_out}'")
-    model_instance.write()
+    if args.vocab_only:
+        print(f"Exporting model vocab to '{fname_out}'")
+        model_instance.write_vocab()
+    else:
+        print(f"Exporting model to '{fname_out}'")
+        model_instance.write()
 
-print(f"Model successfully exported to '{fname_out}'")
+    print(f"Model successfully exported to '{fname_out}'")

From 8a052c131ed3525313cdb84e5ae4e2b6cf8d2e24 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 24 Nov 2023 18:04:31 +0100
Subject: [PATCH 05/64] ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-cuda.cu | 37 ++++++++++++++++++++++---------------
 llama.cpp    |  4 ++--
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f0db7ae35..5b80e4ae3 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -4610,8 +4610,8 @@ static __global__ void rope(
 
 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4620,23 +4620,25 @@ static __global__ void rope_neox(
     }
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col/2;
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
-    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, cur_rot);
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
-    const float x1 = x[i + ncols/2];
+    const float x1 = x[i + n_dims/2];
 
-    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
-    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
 static __global__ void rope_glm_f32(
@@ -5739,20 +5741,26 @@ static void rope_cuda(
 
 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
@@ -6707,15 +6715,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
diff --git a/llama.cpp b/llama.cpp
index 9fb7244b4..5b31f2016 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3469,7 +3469,7 @@ static void llm_build_k_shift(
        struct ggml_cgraph * graph,
             llm_rope_type   type,
                   int64_t   n_ctx,
-                  int64_t   n_rot,
+                  int       n_rot,
                   float     freq_base,
                   float     freq_scale,
        const llm_build_cb & cb) {
@@ -3501,7 +3501,7 @@ static void llm_build_k_shift(
             // we rotate only the first n_rot dimensions
             ggml_rope_custom_inplace(ctx,
                     ggml_view_3d(ctx, kv.k,
-                        n_rot, n_head_kv, n_ctx,
+                        n_embd_head, n_head_kv, n_ctx,
                         ggml_element_size(kv.k)*n_embd_head,
                         ggml_element_size(kv.k)*n_embd_gqa,
                         ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),

From e9c13ff78114af6fc6a4f27cc8dcdda0f3d389fb Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 24 Nov 2023 18:10:01 +0100
Subject: [PATCH 06/64] llama : set metal log callback correctly (#4204)

---
 llama.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5b31f2016..c5f4053f2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1118,6 +1118,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 //
 
 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -8569,8 +8575,6 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_METAL
             if (model->n_gpu_layers > 0) {
-                ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
                 ctx->ctx_metal = ggml_metal_init(1);
                 if (!ctx->ctx_metal) {
                     LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -9706,6 +9710,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {

From af19d3573481d409b3c4e55494810eb1f65a9aae Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Nov 2023 11:29:06 +0200
Subject: [PATCH 07/64] server : OAI API compatibility (#4198)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add openai-compatible POST /v1/chat/completions API endpoint to server example

* fix code style

* Update server README.md

* Improve server README.md

* Fix server.cpp code style according to review

* server : some style changes

* server : indentation

* server : enable special tokens during tokenization by default

* server : minor code style

* server : change random string generator

* straightforward /v1/models endpoint

---------

Co-authored-by: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com>
Co-authored-by: Tobi Lütke <tobi@Tobis-MacBook-Pro.local>
---
 examples/server/README.md  |  49 +++++
 examples/server/server.cpp | 375 +++++++++++++++++++++++++++++++++++--
 2 files changed, 413 insertions(+), 11 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index a6eda3b32..cfc220f58 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -234,6 +234,55 @@ node index.js
 
 -   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 
+-   **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc., can be used with this endpoint. Compared to `api_like_OAI.py`, this API implementation does not require a wrapper to be served.
+
+    *Options:*
+
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+
+    *Examples:*
+
+    You can use either the Python `openai` library with appropriate checkpoints:
+
+    ```python
+    import openai
+
+    client = openai.OpenAI(
+        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+        api_key = "sk-no-key-required"
+    )
+
+    completion = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+        {"role": "user", "content": "Write a limerick about python exceptions"}
+    ]
+    )
+
+    print(completion.choices[0].message)
+    ```
+    ... or raw HTTP requests:
+
+    ```shell
+    curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+    {
+        "role": "system",
+        "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+    },
+    {
+        "role": "user",
+        "content": "Write a limerick about python exceptions"
+    }
+    ]
+    }'
+    ```
+
 ## More examples
 
 ### Change system prompt on runtime
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index be23ad169..50f124b13 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,8 @@
 #define SERVER_VERBOSE 1
 #endif
 
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
 using json = nlohmann::json;
 
 struct server_params
@@ -59,6 +61,10 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
 
+json oaicompat_completion_params_parse(const json &body);
+std::string format_chatml(std::vector<json> messages);
+
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -378,6 +384,9 @@ struct llama_client_slot
     bool stopped_word = false;
     bool stopped_limit = false;
 
+    bool oaicompat = false;
+    std::string oaicompat_model;
+
     std::string stopping_word;
 
     // sampling
@@ -477,7 +486,7 @@ struct llama_client_slot
         };
     }
 
-    void print_timings() {
+    void print_timings() const {
         LOG_TEE("\n");
         LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@@ -609,6 +618,11 @@ struct llama_server_context
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
+        // TODO: currently, we tokenize using special tokens by default
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        //       but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -624,12 +638,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos);
+                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false);
+                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -646,7 +660,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -677,6 +691,14 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;
 
+        if (data.count("__oaicompat") != 0) {
+            slot->oaicompat = true;
+            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+        } else {
+            slot->oaicompat = false;
+            slot->oaicompat_model = "";
+        }
+
         slot->params.stream           = json_value(data, "stream",            false);
         slot->params.cache_prompt     = json_value(data, "cache_prompt",      false);
         slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
@@ -1170,6 +1192,12 @@ struct llama_server_context
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }
 
+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
         queue_results.push_back(res);
     }
 
@@ -1217,6 +1245,12 @@ struct llama_server_context
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
         }
 
+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
         queue_results.push_back(res);
     }
 
@@ -1257,7 +1291,7 @@ struct llama_server_context
         task_server task;
         task.id = id_gen++;
         task.target_id = 0;
-        task.data = data;
+        task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;
         task.type = COMPLETION_TASK;
@@ -2180,6 +2214,233 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     }
 }
 
+
+// Returns 32 characters drawn from [0-9A-Za-z] using a std::mt19937 seeded from std::random_device.
+static std::string random_string()
+{
+    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device seed_source;
+    std::mt19937 rng(seed_source());
+
+    std::string out(32, ' ');
+    for (char & ch : out) {
+        // the modulo maps each generator draw onto an index into the alphabet
+        ch = alphabet[rng() % alphabet.size()];
+    }
+    return out;
+}
+
+static std::string gen_chatcmplid()
+{
+    // completion identifier: fixed "chatcmpl-" prefix followed by a random_string() suffix
+    const std::string suffix = random_string();
+    return "chatcmpl-" + suffix;
+}
+
+// Renders the chat messages as ChatML text and appends an opening assistant turn for the model to complete.
+std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream prompt;
+
+    for (const json & msg : messages) {
+        // defaults handed to json_value: role "user", content ""
+        const std::string role    = json_value(msg, "role",    std::string("user"));
+        const std::string content = json_value(msg, "content", std::string(""));
+        prompt << "<|im_start|>" << role << '\n' << content << "<|im_end|>\n";
+    }
+
+    prompt << "<|im_start|>assistant" << '\n';
+    return prompt.str();
+}
+
+/* Converts an OpenAI chat-completions request body into llama.cpp completion api semantics */
+json oaicompat_completion_params_parse(
+    const json &body /* openai api json semantics */)
+{
+    static const json no_messages = json::array(); // stand-in for an absent "messages": const operator[] must not be used on a missing key
+    json llama_params;
+    llama_params["__oaicompat"] = true;
+
+    // Map OpenAI parameters to llama.cpp parameters
+    llama_params["prompt"]            = format_chatml(body.contains("messages") ? body["messages"] : no_messages); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
+    llama_params["top_k"]             = json_value(body, "top_k", 40);
+    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
+    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
+    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"]              = json_value(body, "seed", 0);
+    llama_params["stream"]            = json_value(body, "stream", false);
+    llama_params["mirostat"]          = json_value(body, "mirostat", false);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
+    llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
+    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+
+    if (body.count("grammar") != 0) { // forwarded only when the request itself supplies a grammar
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }
+
+    // Handle 'stop' field: absent or null -> no stop words, a single string -> one-element array
+    if (!body.contains("stop") || body["stop"].is_null()) {
+        llama_params["stop"] = json::array({});
+    } else if (body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");
+
+    return llama_params;
+}
+
+// Wraps a finished task result in an OpenAI chat-completion object (a final "chat.completion.chunk" when streaming).
+static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
+{
+    const json & result = response.result_json;
+
+    // "stopped_word" is read as a boolean, as the streaming formatter does; key presence alone does not say why generation ended
+    const bool stopped_word        = json_value(result, "stopped_word", false);
+    const bool stopped_eos         = json_value(result, "stopped_eos", false);
+    const int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+    const int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    const std::string content      = json_value(result, "content", std::string(""));
+
+    // "stop" when a stop word or EOS ended generation, otherwise "length"
+    const std::string finish_reason = (stopped_word || stopped_eos) ? "stop" : "length";
+
+    // a streamed final chunk carries an empty delta; a non-streamed reply carries the whole assistant message
+    json choices =
+        streaming ? json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"delta", json::object()}}})
+                  : json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"message", json{{"content", content},
+                                                         {"role", "assistant"}}}}});
+
+    std::time_t t = std::time(0);
+    json res =
+        json{{"choices", choices},
+            {"created", t},
+            {"model",
+                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+            {"usage",
+                json{{"completion_tokens", num_tokens_predicted},
+                    {"prompt_tokens", num_prompt_tokens},
+                    {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
+            {"id", gen_chatcmplid()}};
+
+    if (server_verbose) {
+        res["__verbose"] = result;
+    }
+
+    if (result.contains("completion_probabilities")) {
+        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    }
+
+    return res;
+}
+
+// Formats one streamed task result as OpenAI "chat.completion.chunk" objects; a vector is returned because the first chunk may need to be sent as two (role, then content)
+static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
+    json result = response.result_json;
+
+    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { // without both keys the result is returned as-is
+        return std::vector<json>({response.result_json});
+    }
+
+    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; // a token counter of 0 marks the first chunk
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+    bool stopped_word   = json_value(result, "stopped_word", false);
+    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_limit  = json_value(result, "stopped_limit", false);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason; // stays empty while generation is still running
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+    if (stopped_limit) { // checked last, so "length" wins if several stop flags are set
+        finish_reason = "length";
+    }
+
+    std::time_t t = std::time(0);
+
+    json choices;
+
+    if (!finish_reason.empty()) {
+        choices = json::array({json{{"finish_reason", finish_reason},
+                                    {"index", 0},
+                                    {"delta", json::object()}}});
+    } else {
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior:
+                json initial_ret = json{{"choices", json::array({json{
+                                        {"finish_reason", nullptr},
+                                        {"index", 0},
+                                        {"delta", json{
+                                            {"role", "assistant"}
+                                        }}}})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                            {"choices", json::array({json{{"finish_reason", nullptr},
+                                                            {"index", 0},
+                                                            {"delta", json{
+                                                            {"content", content}}}
+                                                            }})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret}); // role-only delta first, then the content delta
+            }
+        } else {
+            // Some idiosyncrasy in task processing logic makes several trailing calls
+            // with empty content; an empty object is returned for them and ignored at the call site.
+            if (content.empty()) {
+                return std::vector<json>({json::object()});
+            }
+
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                json{
+                    {"content", content},
+                }},
+            }});
+        }
+    }
+
+    json ret = json{{"choices", choices},
+                    {"created", t},
+                    {"id", gen_chatcmplid()},
+                    {"model", modelname},
+                    {"object", "chat.completion.chunk"}};
+
+    return std::vector<json>({ret});
+}
+
 static json format_partial_response(
     llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
@@ -2356,9 +2617,9 @@ int main(int argc, char **argv)
                             task_result result = llama.next_result(task_id);
                             if (!result.error) {
                                 const std::string str =
-                                "data: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
+                                    "data: " +
+                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                                    "\n\n";
                                 LOG_VERBOSE("data stream", {
                                     { "to_send", str }
                                 });
@@ -2371,9 +2632,9 @@ int main(int argc, char **argv)
                                 }
                             } else {
                                 const std::string str =
-                                "error: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
+                                    "error: " +
+                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                                    "\n\n";
                                 LOG_VERBOSE("data stream", {
                                     { "to_send", str }
                                 });
@@ -2398,6 +2659,98 @@ int main(int argc, char **argv)
                 }
             });
 
+
+
+    svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
+            {
+                const std::time_t created_at = std::time(nullptr);
+                // the list holds a single entry, identified by the configured model alias
+                const json model_entry = json{
+                    {"id", params.model_alias},
+                    {"object", "model"},
+                    {"created", created_at},
+                    {"owned_by", "llamacpp"}
+                };
+
+                const json models = json{
+                    {"object", "list"},
+                    {"data", json::array({model_entry})}
+                };
+
+                res.set_content(models.dump(), "application/json");
+            });
+
+    // TODO: add mount point without "/v1" prefix -- how?
+    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                json data = oaicompat_completion_params_parse(json::parse(req.body));
+                // queue the translated request as a completion task (both mode flags false)
+                const int task_id = llama.request_completion(data, false, false);
+
+                if (!json_value(data, "stream", false)) {
+                    // non-streaming: a single result is fetched and it must be the final (stop) one
+                    task_result result = llama.next_result(task_id);
+
+                    if (!result.error && result.stop) {
+                        json oaicompat_result = format_final_response_oaicompat(data, result);
+
+                        res.set_content(oaicompat_result.dump(-1, ' ', false,
+                                            json::error_handler_t::replace),
+                                            "application/json");
+                    } else {
+                        res.status = 500;
+                        res.set_content(result.result_json["content"], "text/plain");
+                        return;
+                    }
+                } else {
+                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
+                        while (true) {
+                            task_result llama_result = llama.next_result(task_id);
+                            if (!llama_result.error) {
+                                std::vector<json> result_array = format_partial_response_oaicompat(llama_result);
+                                // one task result may expand to several chunks; empty objects are
+                                // placeholders and are not sent
+                                for (const json & chunk : result_array) {
+                                    if (!chunk.empty()) {
+                                        const std::string str =
+                                            "data: " +
+                                            chunk.dump(-1, ' ', false, json::error_handler_t::replace) +
+                                            "\n\n";
+                                        LOG_VERBOSE("data stream", {{"to_send", str}});
+                                        if (!sink.write(str.c_str(), str.size())) {
+                                            return false;
+                                        }
+                                    }
+                                }
+                                if (llama_result.stop) {
+                                    break;
+                                }
+                            } else {
+                                const std::string str =
+                                    "error: " +
+                                    llama_result.result_json.dump(-1, ' ', false,
+                                            json::error_handler_t::replace) +
+                                    "\n\n";
+                                LOG_VERBOSE("data stream", {{"to_send", str}});
+                                if (!sink.write(str.c_str(), str.size())) {
+                                    return false;
+                                }
+                                break;
+                            }
+                        }
+                        sink.done();
+                        return true;
+                    };
+
+                    auto on_complete = [task_id, &llama](bool) {
+                        // cancel request
+                        llama.request_cancel(task_id);
+                    };
+
+                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+                }
+            });
+
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);

From 04814e718edb13bdf8cca861dc2e5ab4e1995c30 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Nov 2023 12:02:13 +0200
Subject: [PATCH 08/64] readme : update hot topics

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5189e1255..2f83a71fd 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 
 ----

From 3014b5415d08e3dff961da6eea835b9760a701b8 Mon Sep 17 00:00:00 2001
From: crasm <crasm@git.vczf.us>
Date: Sat, 25 Nov 2023 10:47:07 -0500
Subject: [PATCH 09/64] Update docs for yarn_ext_factor <0.0 as unspecified
 instead of NaN (#4189)

---
 convert.py | 0
 llama.h    | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 100755 convert.py

diff --git a/convert.py b/convert.py
old mode 100644
new mode 100755
diff --git a/llama.h b/llama.h
index 1a62058d1..89cb6198e 100644
--- a/llama.h
+++ b/llama.h
@@ -185,7 +185,7 @@ extern "C" {
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
         float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
         float    yarn_attn_factor; // YaRN magnitude scaling factor
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim

From f837c3a992b2b6146936cb120871a8cf9d0e3857 Mon Sep 17 00:00:00 2001
From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Date: Sat, 25 Nov 2023 08:58:23 -0800
Subject: [PATCH 10/64] llama : grammar `reserve` space in `decode_utf8`
 (#4210)

* reserve space for codepoints

* improvement for the appended 0
---
 llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index c5f4053f2..f2b5967d7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6420,10 +6420,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char         * src,
+        size_t               n_src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char          * pos      = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t              value    = partial_start.value;
     int                   n_remain = partial_start.n_remain;
 
@@ -6474,6 +6477,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src, // by reference: only read here, so no per-call copy of the piece
+        llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start); // size is forwarded as n_src
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7123,7 +7133,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7330,7 +7340,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
+    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);

From 1ddb52ec38f9931925a587f45a23b1c37152c028 Mon Sep 17 00:00:00 2001
From: Galunid <karolek1231456@gmail.com>
Date: Sat, 25 Nov 2023 22:45:02 +0100
Subject: [PATCH 11/64] scripts : Use mmap in torch load (#4202)

* Use mmap in torch load, prefer .bin files when loading

* Revert .bin > .safetensors preference
---
 convert-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 147d5717e..53ce76c70 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -59,7 +59,7 @@ class Model:
                 from safetensors import safe_open
                 ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
             else:
-                ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
+                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
 
             with ctx as model_part:
                 for name in model_part.keys():

From 22da05536ff4ad963080773bef1fb839fdab95d3 Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin <jinxiaoyong@gmail.com>
Date: Sun, 26 Nov 2023 02:30:02 -0600
Subject: [PATCH 12/64] metal : fix yarn (#4220)

get the correct n_orig_ctx in metal
---
 ggml-metal.m | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index a9fdd3903..d52a1c3c4 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1433,7 +1433,8 @@ void ggml_metal_graph_compute(
                             const int n_past     = ((int32_t *) dst->op_params)[0];
                             const int n_dims     = ((int32_t *) dst->op_params)[1];
                             const int mode       = ((int32_t *) dst->op_params)[2];
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+                            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
                             float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                             memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));

From 922754a8d60080e956891f6cee1fb03aa48d57c6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 26 Nov 2023 20:33:07 +0200
Subject: [PATCH 13/64] lookahead : add example for lookahead decoding (#4207)

* lookahead : init

* lookahead : generate and store n-grams

* lookahead : use loop instead recursion to generate n-grams

* lookahead : initial working implementation

* lookahead : filter repeating n-grams

* lookahead : use deterministic init

* lookahead : add to Makefile

* lookahead : fix a bug in the seq_id of the lookahead tokens

* lookahead : add comments

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 .gitignore                        |   1 +
 Makefile                          |   5 +-
 examples/CMakeLists.txt           |   1 +
 examples/lookahead/CMakeLists.txt |   5 +
 examples/lookahead/lookahead.cpp  | 487 ++++++++++++++++++++++++++++++
 5 files changed, 498 insertions(+), 1 deletion(-)
 create mode 100644 examples/lookahead/CMakeLists.txt
 create mode 100644 examples/lookahead/lookahead.cpp

diff --git a/.gitignore b/.gitignore
index 41259a12f..3806e05dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,7 @@ models-mnt
 /libllama.so
 /llama-bench
 /llava-cli
+/lookahead
 /main
 /metal
 /perplexity
diff --git a/Makefile b/Makefile
index a6d2c2ec0..95d85236f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -657,6 +657,9 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS)
 parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 71bcb6893..6744944fd 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -32,6 +32,7 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(speculative)
+    add_subdirectory(lookahead)
     add_subdirectory(train-text-from-scratch)
     if (LLAMA_METAL)
         add_subdirectory(metal)
diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt
new file mode 100644
index 000000000..8827e3f11
--- /dev/null
+++ b/examples/lookahead/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET lookahead)
+add_executable(${TARGET} lookahead.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
new file mode 100644
index 000000000..4c49a85eb
--- /dev/null
+++ b/examples/lookahead/lookahead.cpp
@@ -0,0 +1,487 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+struct ngram_data { // state of one verification n-gram within the current batch
+    bool active = false; // cleared once a sampled token stops matching or the n-gram is exhausted
+    // sequence id under which the n-gram's tokens are added to the batch (main uses W + 1 + g)
+    llama_seq_id seq_id = -1;
+    // batch index of each token of the n-gram; used to select the logits to sample from
+    std::vector<int> i_batch;
+    // tokens of the n-gram; element 0 is the current input token
+    std::vector<llama_token> tokens;
+};
+
+// n-gram container
+struct ngram_container {
+    ngram_container(int n_vocab, int N, int G)
+        : cnt(n_vocab)
+        , head(n_vocab)
+        , tokens(n_vocab * G * (N - 1)) {
+    }
+
+    int n_total = 0; // incremented each time an n-gram is stored
+
+    std::vector<int> cnt;  // per first token: number of stored n-grams (main caps it at G)
+    std::vector<int> head; // per first token: slot that the next n-gram is written to
+
+    // flat [n_vocab][G][N - 1] storage:
+    // every vocab token owns a ring-buffer of G slots, each holding an n-gram tail of N - 1 tokens
+    std::vector<llama_token> tokens;
+};
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    const int W = 15; // lookahead window
+    const int N = 5;  // n-gram size
+    const int G = 15; // max verification n-grams
+
+    const bool dump_kv_cache = params.dump_kv_cache;
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookahead", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
+    // init llama.cpp
+    llama_backend_init(params.numa);
+
+    llama_model * model = NULL;
+    llama_context * ctx = NULL;
+
+    // load the target model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    // Tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+    LOG("add_bos tgt: %d\n", add_bos);
+
+    std::vector<llama_token> inp;
+    std::vector<llama_token> all;
+
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    all = inp;
+
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
+
+    if ((int) inp.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        return 1;
+    }
+
+    fprintf(stderr, "\n\n");
+
+    for (auto id : inp) {
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+    }
+
+    fflush(stderr);
+
+    const int n_input = inp.size();
+
+    const auto t_enc_start = ggml_time_us();
+
+    // eval the prompt
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+
+    for (int s = 1; s < W + G + 1; ++s) {
+        llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+    }
+
+    const auto t_enc_end = ggml_time_us();
+
+    int n_predict = 0;
+    int n_accept  = 0;
+
+    int n_past = inp.size();
+
+    llama_token id = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // for each decoded batch, we have at most W + G + 1 distinct sequences:
+    // seq_id == 0           : the current input token
+    // seq_id [1, W]         : tokens from the past N - 1 Jacobi iterations
+    // seq_id [W + 1, W + G] : verification n-grams
+    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
+
+    // target model sampling context
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+
+    // verification n-grams
+    std::vector<ngram_data> ngrams_cur(G);
+
+    // tokens for the past N - 1 Jacobi iterations
+    std::vector<llama_token> tokens_j_prev(W);
+    std::vector<std::vector<llama_token>> tokens_j(N - 1);
+    for (int j = 0; j < N - 1; j++) {
+        tokens_j[j].resize(W);
+
+        for (int i = 0; i < W; i++) {
+            // there are different ways to init these tokens
+            if (0) {
+                // initialize randomly from the prompt tokens
+                tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
+            } else {
+                // initialize with a sequence of increasing numbers
+                tokens_j[j][i] = 100 + i;
+            }
+        }
+    }
+
+    std::vector<llama_seq_id> seq_id_look;
+
+    // the input token belongs to all sequences
+    std::vector<llama_seq_id> seq_id_all(W + G + 1);
+    for (int i = 0; i < W + G + 1; i++) {
+        seq_id_all[i] = i;
+    }
+
+    // here we keep adding new n-grams as we go
+    ngram_container ngrams_observed(llama_n_vocab(model), N, G);
+
+    // debug
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
+
+    const auto t_dec_start = ggml_time_us();
+
+    // sample first token
+    {
+        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+
+        llama_sampling_accept(ctx_sampling, ctx, id, true);
+
+        {
+            const std::string token_str = llama_token_to_piece(ctx, id);
+
+            printf("%s", token_str.c_str());
+            fflush(stdout);
+        }
+    }
+
+    while (true) {
+        // debug
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }
+
+        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
+        //
+        // Example for W = 5, N = 4, G = 2:
+        // (I = input, L = lookahead, V = verification)
+        //
+        // Batch:  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+        // T:        -2 -2 -2 -2 -1 -1 -1 -1 -1  0  0  0  0  0  0
+        // Info:   I  L  L  L  L  L  L  L  L  L  L  L  L  L  L  V  V  V  V  V  V
+        // Pos:    0  1  2  3  4  1  2  3  4  5  2  3  4  5  6  1  2  3  1  2  3   (+ n_past)
+        // Logits: 1  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  1
+        // ---------------------------------------------------------------------
+        // Seq:    0
+        //         1              1              1
+        //         2  2              2              2
+        //         3  3  3              3              3
+        //         4  4  4  4              4              4
+        //         5  5  5  5  5              5              5
+        //         6                                            6  6  6
+        //         7                                                     7  7  7
+        // ---------------------------------------------------------------------
+        //                                       |  |  |  |  |  |  |  |  |  |  |
+        //                                       V  V  V  V  V  |  |  |  |  |  |
+        //                                         j_tokens     |  |  |  |  |  |
+        //                                                      V  V  V  V  V  V
+        //                                                             id
+        {
+            llama_batch_clear(batch);
+
+            // current token - first token of the first level
+            llama_batch_add(batch, id, n_past, seq_id_all, true);
+
+            // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
+            {
+                const int g_cur = ngrams_observed.cnt[id];
+
+                ngrams_cur.resize(g_cur);
+                for (int g = 0; g < g_cur; g++) {
+                    ngrams_cur[g].active = true;
+                    ngrams_cur[g].tokens.resize(N);
+                    ngrams_cur[g].i_batch.resize(N);
+                    ngrams_cur[g].seq_id = W + 1 + g;
+                    ngrams_cur[g].i_batch[0] = 0;
+                    ngrams_cur[g].tokens [0] = id;
+                }
+
+                for (int j = 0; j < N - 1; j++) {
+                    for (int g = 0; g < g_cur; g++) {
+                        const int idx = id*(N - 1)*G + g*(N - 1);
+
+                        const llama_token t = ngrams_observed.tokens[idx + j];
+
+                        ngrams_cur[g].tokens [j + 1] = t;
+                        ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
+
+                        llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+                    }
+                }
+            }
+
+            // fill the remaining W - 1 tokens for the first level
+            for (int i = 1; i < W; i++) {
+                seq_id_look.resize(W - i);
+                for (int j = 0; j < W - i; j++) {
+                    seq_id_look[j] = i + j + 1;
+                }
+
+                llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+            }
+
+            // fill the rest of the levels
+            for (int j = 1; j < N - 1; j++) {
+                for (int i = 0; i < W; i++) {
+                    llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+                }
+            }
+        }
+
+        if (llama_decode(ctx, batch) != 0) {
+            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+            return 1;
+        }
+
+        int seq_id_best = 0;
+
+        for (int v = 0; v < N; ++v) {
+            int i_batch = 0;
+
+            // if no active ngrams are left, it means the sampled token does not pass the verification
+            if (v > 0) {
+                for (int g = 0; g < (int) ngrams_cur.size(); g++) {
+                    if (ngrams_cur[g].active) {
+                        i_batch = ngrams_cur[g].i_batch[v];
+                        seq_id_best = ngrams_cur[g].seq_id;
+
+                        ++n_accept;
+                        break;
+                    }
+                }
+
+                // no more matches -> create a new batch
+                if (i_batch == 0) {
+                    break;
+                }
+            }
+
+            // sample the next token
+            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+
+            llama_sampling_accept(ctx_sampling, ctx, id, true);
+
+            // print
+            {
+                const std::string token_str = llama_token_to_piece(ctx, id);
+
+                if (v == 0) {
+                    printf("%s", token_str.c_str());
+                } else {
+                    // print light cyan
+                    printf("\033[0;96m%s\033[0m", token_str.c_str());
+                }
+                fflush(stdout);
+
+                if (id == llama_token_eos(model)) {
+                    has_eos = true;
+                }
+
+                all.push_back(id);
+            }
+
+            ++n_predict;
+            ++n_past;
+
+            if (n_predict > params.n_predict || has_eos) {
+                break;
+            }
+
+            // verify across active n-grams
+            for (int g = 0; g < (int) ngrams_cur.size(); g++) {
+                if (ngrams_cur[g].active) {
+                    if (v == N - 1) {
+                        ngrams_cur[g].active = false;
+                    } else {
+                        if (id != ngrams_cur[g].tokens[v + 1]) {
+                            ngrams_cur[g].active = false;
+                        }
+                    }
+                }
+            }
+
+            // print known n-grams starting with token id (debug)
+            if (0 && v == 0) {
+                if (ngrams_observed.cnt[id] > 0) {
+                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                }
+
+                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
+                    printf("   - ngram %2d: ", i);
+
+                    const int idx = id*(N - 1)*G + i*(N - 1);
+
+                    for (int j = 0; j < N - 1; j++) {
+                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+
+                        printf("%s", token_str.c_str());
+                    }
+
+                    printf("\n");
+                }
+            }
+
+            // update lookahead tokens
+            {
+                for (int i = 0; i < W; i++) {
+                    tokens_j_prev[i] = tokens_j[0][i];
+                }
+
+                for (int j = 0; j < N - 2; j++) {
+                    tokens_j[j] = tokens_j[j + 1];
+                }
+
+                if (v == 0) {
+                    // sample from the last level
+                    for (int i = 0; i < W; i++) {
+                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                    }
+                } else {
+                    for (int i = 0; i < W; i++) {
+                        // there are different ways to init these tokens
+                        if (0) {
+                            // random init
+                            tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
+                        } else {
+                            // init from the previous level
+                            tokens_j[N - 2][i] = tokens_j[0][i];
+                        }
+                    }
+                }
+            }
+
+            // update observed ngrams
+            if (v == 0) {
+                // the first token of the n-gram is determined by the index in the container so it is not stored
+                std::vector<llama_token> ngram(N - 1);
+
+                // n-gram generation
+                // ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
+                for (int f = 0; f < W; ++f) {
+                    const int ft = tokens_j_prev[f]; // first token of the n-gram
+
+                    for (int j = 0; j < N - 1; ++j) {
+                        ngram[j] = tokens_j[j][f];
+                    }
+
+                    // filter-out repeating n-grams
+                    {
+                        bool is_unique = true;
+
+                        for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
+                            const int idx = ft*(N - 1)*G + k*(N - 1);
+
+                            bool is_match = true;
+                            for (int j = 0; j < N - 1; ++j) {
+                                if (ngrams_observed.tokens[idx + j] != ngram[j]) {
+                                    is_match = false;
+                                    break;
+                                }
+                            }
+
+                            if (is_match) {
+                                is_unique = false;
+                                break;
+                            }
+                        }
+
+                        if (!is_unique) {
+                            continue;
+                        }
+                    }
+
+                    const int head = ngrams_observed.head[ft];
+                    const int idx  = ft*(N - 1)*G + head*(N - 1);
+
+                    for (int i = 0; i < N - 1; i++) {
+                        ngrams_observed.tokens[idx + i] = ngram[i];
+                    }
+
+                    ngrams_observed.cnt[ft]  = std::min(G, ngrams_observed.cnt[ft] + 1);
+                    ngrams_observed.head[ft] = (head + 1) % G;
+
+                    ngrams_observed.n_total++;
+                }
+            }
+        }
+
+        if (n_predict > params.n_predict || has_eos) {
+            break;
+        }
+
+        // KV cache management
+        // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
+        llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
+
+        if (seq_id_best != 0) {
+            // if a verification token matched, we keep the best sequence and remove the rest
+            // this leads to some KV cache fragmentation
+            llama_kv_cache_seq_keep(ctx, seq_id_best);
+            llama_kv_cache_seq_cp  (ctx, seq_id_best, 0, -1, -1);
+            llama_kv_cache_seq_rm  (ctx, seq_id_best,    -1, -1);
+
+            for (int s = 1; s < W + G + 1; ++s) {
+                llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+            }
+        }
+    }
+
+    auto t_dec_end = ggml_time_us();
+
+    LOG_TEE("\n\n");
+
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_TEE("\n");
+    LOG_TEE("W = %2d\n", W);
+    LOG_TEE("N = %2d\n", N);
+    LOG_TEE("G = %2d\n", G);
+    LOG_TEE("\n");
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_accept  = %d\n", n_accept);
+
+    llama_print_timings(ctx);
+
+    llama_kv_cache_view_free(&kvc_view);
+    llama_sampling_free(ctx_sampling);
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}

From 9656026b53236ed7328458269c4c798dd50ac8d1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 26 Nov 2023 20:42:51 +0200
Subject: [PATCH 14/64] readme : update hot topics

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2f83a71fd..2892132c4 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 

From 3e73d31d9cc0232882ce61c64742aff3ecfec416 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 26 Nov 2023 21:51:46 +0200
Subject: [PATCH 15/64] lookahead : support `-n -1` infinite generation

---
 examples/lookahead/lookahead.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 4c49a85eb..e55a15a1b 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -311,7 +311,7 @@ int main(int argc, char ** argv) {
             ++n_predict;
             ++n_past;
 
-            if (n_predict > params.n_predict || has_eos) {
+            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
                 break;
             }
 
@@ -433,7 +433,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        if (n_predict > params.n_predict || has_eos) {
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
             break;
         }
 

From f3b269813f6147c5b5cda082e6b45cf04a932e0d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Sun, 26 Nov 2023 22:58:43 -0500
Subject: [PATCH 16/64] ggml : fix -Warray-bounds warning with gcc (#4231)

---
 ggml.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index f92292b39..0c7264a36 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15689,13 +15689,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
         default:
             {
-                printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }

From bb03290c17540768a16000a2b01ee4f22440aba1 Mon Sep 17 00:00:00 2001
From: Bailey Chittle <39804642+bachittle@users.noreply.github.com>
Date: Mon, 27 Nov 2023 09:56:52 -0500
Subject: [PATCH 17/64] examples : iOS example with swift ui (#4159)

* copy to llama.cpp as subdir

* attempt enabling metal, fails

* ggml metal compiles!

* Update README.md

* initial conversion to new format, utf8 errors?

* bug fixes, but now has an invalid memory access :(

* added O3, now has insufficient memory access

* begin sync with master

* update to match latest code, new errors

* fixed it!

* fix for loop conditionals, increase result size

* fix current workflow errors

* attempt a llama.swiftui workflow

* Update .github/workflows/build.yml

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 .github/workflows/build.yml                   |  11 +
 examples/llama.swiftui/.gitignore             |   1 +
 examples/llama.swiftui/README.md              |   7 +
 .../llama.cpp.swift/LibLlama.swift            | 176 +++++++
 .../llama.cpp.swift/bridging-header.h         |   5 +
 .../llama.swiftui.xcodeproj/project.pbxproj   | 481 ++++++++++++++++++
 .../contents.xcworkspacedata                  |   7 +
 .../xcshareddata/IDEWorkspaceChecks.plist     |   8 +
 .../AccentColor.colorset/Contents.json        |  11 +
 .../AppIcon.appiconset/Contents.json          |  13 +
 .../Assets.xcassets/Contents.json             |   6 +
 .../llama.swiftui/Models/LlamaState.swift     |  45 ++
 .../Preview Assets.xcassets/Contents.json     |   6 +
 .../llama.swiftui/Resources/models/.gitignore |   0
 .../llama.swiftui/UI/ContentView.swift        |  42 ++
 .../llama.swiftui/llama_swiftuiApp.swift      |  10 +
 16 files changed, 829 insertions(+)
 create mode 100644 examples/llama.swiftui/.gitignore
 create mode 100644 examples/llama.swiftui/README.md
 create mode 100644 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
 create mode 100644 examples/llama.swiftui/llama.cpp.swift/bridging-header.h
 create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
 create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
 create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json
 create mode 100644 examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
 create mode 100644 examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json
 create mode 100644 examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
 create mode 100644 examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
 create mode 100644 examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index bc295d52d..22be233e6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -498,6 +498,17 @@ jobs:
           path: |
             cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
+  ios-xcode-build: # builds the llama.swiftui example for a generic iOS device with code signing disabled
+    runs-on: macos-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Build Xcode project
+        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
+
+
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
diff --git a/examples/llama.swiftui/.gitignore b/examples/llama.swiftui/.gitignore
new file mode 100644
index 000000000..9bce6af39
--- /dev/null
+++ b/examples/llama.swiftui/.gitignore
@@ -0,0 +1 @@
+xcuserdata
diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md
new file mode 100644
index 000000000..fa68e6ed8
--- /dev/null
+++ b/examples/llama.swiftui/README.md
@@ -0,0 +1,7 @@
+# llama.swiftui
+
+Local inference of llama.cpp on an iPhone.
+So far I have only tested it with the StarCoder 1B model, but it can most likely handle 7B models as well.
+
+https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
+
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
new file mode 100644
index 000000000..aaef09611
--- /dev/null
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -0,0 +1,176 @@
+import Foundation
+
+// import llama
+
+enum LlamaError: Error { // errors thrown while creating a LlamaContext
+    case couldNotInitializeContext // thrown by createContext(path:) when the model or the context cannot be created
+}
+
+actor LlamaContext {
+    private var model: OpaquePointer
+    private var context: OpaquePointer
+    private var batch: llama_batch
+    private var tokens_list: [llama_token]
+
+    var n_len: Int32 = 512
+    var n_cur: Int32 = 0
+    var n_decode: Int32 = 0
+
+    init(model: OpaquePointer, context: OpaquePointer) { // stores the handles that deinit later frees
+        tokens_list = []
+        batch = llama_batch_init(512, 0, 1)
+        self.model = model
+        self.context = context
+    }
+
+    deinit { // frees the batch, then the context, the model and finally the backend
+        llama_batch_free(batch) // allocated by llama_batch_init() in init(); was never released before
+        llama_free(context)
+        llama_free_model(model)
+        llama_backend_free()
+    }
+    // Loads the model at `path` and creates a context for it; throws couldNotInitializeContext if either step fails.
+    static func createContext(path: String) throws -> LlamaContext {
+        llama_backend_init(false)
+        let model_params = llama_model_default_params()
+        let model = llama_load_model_from_file(path, model_params)
+        guard let model else {
+            print("Could not load model at \(path)")
+            throw LlamaError.couldNotInitializeContext
+        }
+        var ctx_params = llama_context_default_params()
+        ctx_params.seed = 1234
+        ctx_params.n_ctx = 2048
+        ctx_params.n_threads = 8
+        ctx_params.n_threads_batch = 8
+
+        let context = llama_new_context_with_model(model, ctx_params)
+        guard let context else {
+            print("Could not load context!")
+            llama_free_model(model) // no LlamaContext will ever own this model, so release it here
+            throw LlamaError.couldNotInitializeContext
+        }
+        return LlamaContext(model: model, context: context)
+    }
+
+    func get_n_tokens() -> Int32 { // number of tokens currently staged in `batch`
+        batch.n_tokens
+    }
+
+    // Tokenizes `text` (add_bos: true), stages the whole prompt in `batch` and decodes it.
+    func completion_init(text: String) {
+        print("attempting to complete \"\(text)\"")
+
+        tokens_list = tokenize(text: text, add_bos: true)
+
+        let n_ctx = llama_n_ctx(context)
+        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
+        print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
+
+        if n_kv_req > n_ctx {
+            print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
+        }
+
+        for id in tokens_list {
+            print(token_to_piece(token: id))
+        }
+
+        // `batch` is allocated in init() via llama_batch_init(512, ...). Writing more than 512 entries,
+        // or writing logits[-1] for an empty prompt, is out of bounds, so refuse instead of decoding.
+        guard !tokens_list.isEmpty, tokens_list.count <= 512 else {
+            print("error: prompt must contain between 1 and 512 tokens")
+            return
+        }
+        batch.n_tokens = Int32(tokens_list.count)
+        for (i, token) in tokens_list.enumerated() {
+            batch.token[i] = token
+            batch.pos[i] = Int32(i)
+            batch.n_seq_id[i] = 1
+            batch.seq_id[i]![0] = 0
+            batch.logits[i] = (i == tokens_list.count - 1) ? 1 : 0 // logits only for the last token
+        }
+        if llama_decode(context, batch) != 0 {
+            print("llama_decode() failed")
+        }
+        n_cur = batch.n_tokens
+    }
+
+    // Greedily samples the next token from the logits of the last staged batch entry, decodes it and
+    // returns its text. Returns "" once EOS is sampled, n_cur == n_len, or no logits are available.
+    func completion_loop() -> String {
+        var new_token_id: llama_token = 0
+        let n_vocab = llama_n_vocab(model)
+        // With an empty batch the logits index would be -1; a nil result was previously force-unwrapped.
+        guard batch.n_tokens > 0, let logits = llama_get_logits_ith(context, batch.n_tokens - 1) else {
+            print("completion_loop: no logits available")
+            return ""
+        }
+
+        var candidates = Array<llama_token_data>()
+        candidates.reserveCapacity(Int(n_vocab))
+        for token_id in 0..<n_vocab {
+            candidates.append(llama_token_data(id: token_id, logit: logits[Int(token_id)], p: 0.0))
+        }
+        candidates.withUnsafeMutableBufferPointer() { buffer in
+            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
+            new_token_id = llama_sample_token_greedy(context, &candidates_p)
+        }
+
+        // EOS is looked up on the model handle, like llama_n_vocab / llama_tokenize / llama_token_to_piece here.
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+            print("\n")
+            return ""
+        }
+
+        let new_token_str = token_to_piece(token: new_token_id)
+        print(new_token_str)
+
+        // Reset the batch so it holds only the new token at position n_cur, with logits requested.
+        batch.n_tokens = 0
+        batch.token[Int(batch.n_tokens)] = new_token_id
+        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.n_seq_id[Int(batch.n_tokens)] = 1
+        batch.seq_id[Int(batch.n_tokens)]![0] = 0
+        batch.logits[Int(batch.n_tokens)] = 1 // true
+        batch.n_tokens += 1
+        n_decode += 1
+        n_cur += 1
+
+        if llama_decode(context, batch) != 0 {
+            print("failed to evaluate llama!")
+        }
+        return new_token_str
+    }
+
+    func clear() { // forgets the stored prompt tokens; n_cur, n_decode and batch are left as they are
+        tokens_list = []
+    }
+
+    // Tokenizes `text` with the model's vocabulary; returns [] when llama_tokenize yields no tokens or fails.
+    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
+        // llama_tokenize is given the text length in bytes. String.count counts Characters, which
+        // under-counts non-ASCII text, so size the length and the buffer from the UTF-8 view.
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
+        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
+        defer { tokens.deallocate() }
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+
+        // A negative count signals failure; the old `0..<tokenCount` range would trap on it.
+        guard tokenCount > 0 else { return [] }
+        return Array(UnsafeBufferPointer(start: tokens, count: Int(tokenCount)))
+    }
+
+    // Returns the text of `token`. llama_token_to_piece writes no NUL terminator and returns the negated
+    // required size when the buffer is too small, so retry once with that size and decode by length.
+    private func token_to_piece(token: llama_token) -> String {
+        var buf = [CChar](repeating: 0, count: 8)
+        var n = llama_token_to_piece(model, token, &buf, Int32(buf.count))
+        if n < 0 {
+            buf = [CChar](repeating: 0, count: Int(-n))
+            n = llama_token_to_piece(model, token, &buf, Int32(buf.count))
+        }
+        let bytes = buf.prefix(Int(max(n, 0))).map { UInt8(bitPattern: $0) }
+        return String(decoding: bytes, as: UTF8.self)
+    }
+}
diff --git a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
new file mode 100644
index 000000000..6cd72c979
--- /dev/null
+++ b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
@@ -0,0 +1,5 @@
+//
+//  Use this file to import your target's public headers that you would like to expose to Swift.
+//
+
+#import "llama.h"
diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
new file mode 100644
index 000000000..bc1fd15ce
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -0,0 +1,481 @@
+// !$*UTF8*$!
+{
+    archiveVersion = 1;
+    classes = {
+    };
+    objectVersion = 56;
+    objects = {
+
+/* Begin PBXBuildFile section */
+        542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
+        5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
+        542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
+        542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
+        542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
+        542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
+        549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
+        549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+        8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
+        8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
+        8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
+        8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
+        8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
+        8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
+        8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
+        8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+        542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
+        542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
+        542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
+        5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
+        542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
+        542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
+        542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
+        542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
+        542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
+        542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
+        549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
+        549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
+        549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
+        549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+        8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
+        8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
+        8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
+        8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+        8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+        8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
+        8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+        8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
+        8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
+        8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
+        8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+        8A1C83702AC328BD0096AF73 /* Frameworks */ = {
+            isa = PBXFrameworksBuildPhase;
+            buildActionMask = 2147483647;
+            files = (
+                549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
+                8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
+            );
+            runOnlyForDeploymentPostprocessing = 0;
+        };
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+        8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
+            isa = PBXGroup;
+            children = (
+                5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
+                542376092B0D9C40008E6A1C /* ggml-backend.h */,
+                542376062B0D9BEA008E6A1C /* ggml-quants.h */,
+                542376072B0D9BFB008E6A1C /* ggml-quants.c */,
+                549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
+                549479C62AC9E0F200E0F78B /* ggml-metal.h */,
+                549479C52AC9E0F200E0F78B /* ggml-metal.m */,
+                542EA09B2AC8723900A8AEE9 /* ggml.c */,
+                542EA09C2AC8723900A8AEE9 /* ggml.h */,
+                542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
+                542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
+                542EA0A12AC8729100A8AEE9 /* llama.cpp */,
+                542EA0A22AC8729100A8AEE9 /* llama.h */,
+            );
+            name = llama.cpp;
+            sourceTree = "<group>";
+        };
+        8A1C836A2AC328BD0096AF73 = {
+            isa = PBXGroup;
+            children = (
+                8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
+                8A907F312AC7134E006146EA /* llama.cpp.swift */,
+                8A3F84232AC4C891005E2EE8 /* models */,
+                8A1C83752AC328BD0096AF73 /* llama.swiftui */,
+                8A1C83742AC328BD0096AF73 /* Products */,
+                8A39BE082AC7601000BFEB40 /* Frameworks */,
+            );
+            sourceTree = "<group>";
+        };
+        8A1C83742AC328BD0096AF73 /* Products */ = {
+            isa = PBXGroup;
+            children = (
+                8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
+            );
+            name = Products;
+            sourceTree = "<group>";
+        };
+        8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
+            isa = PBXGroup;
+            children = (
+                8A3F84102AC4BD85005E2EE8 /* Resources */,
+                8A9F7C4B2AC332DC008AE1EA /* Models */,
+                8A9F7C4A2AC332BF008AE1EA /* UI */,
+                8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
+                8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
+                8A1C837C2AC328BE0096AF73 /* Preview Content */,
+            );
+            path = llama.swiftui;
+            sourceTree = "<group>";
+        };
+        8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
+            isa = PBXGroup;
+            children = (
+                8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
+            );
+            path = "Preview Content";
+            sourceTree = "<group>";
+        };
+        8A39BE082AC7601000BFEB40 /* Frameworks */ = {
+            isa = PBXGroup;
+            children = (
+                549479CA2AC9E16000E0F78B /* Metal.framework */,
+                8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
+            );
+            name = Frameworks;
+            sourceTree = "<group>";
+        };
+        8A3F84102AC4BD85005E2EE8 /* Resources */ = {
+            isa = PBXGroup;
+            children = (
+                8A3F84112AC4BD8C005E2EE8 /* models */,
+            );
+            path = Resources;
+            sourceTree = "<group>";
+        };
+        8A3F84112AC4BD8C005E2EE8 /* models */ = {
+            isa = PBXGroup;
+            children = (
+                8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
+            );
+            path = models;
+            sourceTree = "<group>";
+        };
+        8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
+            isa = PBXGroup;
+            children = (
+                8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
+                8A907F322AC7134E006146EA /* LibLlama.swift */,
+            );
+            path = llama.cpp.swift;
+            sourceTree = "<group>";
+        };
+        8A9F7C4A2AC332BF008AE1EA /* UI */ = {
+            isa = PBXGroup;
+            children = (
+                8A1C83782AC328BD0096AF73 /* ContentView.swift */,
+            );
+            path = UI;
+            sourceTree = "<group>";
+        };
+        8A9F7C4B2AC332DC008AE1EA /* Models */ = {
+            isa = PBXGroup;
+            children = (
+                8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
+            );
+            path = Models;
+            sourceTree = "<group>";
+        };
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+        8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
+            isa = PBXNativeTarget;
+            buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
+            buildPhases = (
+                8A1C836F2AC328BD0096AF73 /* Sources */,
+                8A1C83702AC328BD0096AF73 /* Frameworks */,
+                8A1C83712AC328BD0096AF73 /* Resources */,
+            );
+            buildRules = (
+            );
+            dependencies = (
+            );
+            name = llama.swiftui;
+            packageProductDependencies = (
+            );
+            productName = llama.swiftui;
+            productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
+            productType = "com.apple.product-type.application";
+        };
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+        8A1C836B2AC328BD0096AF73 /* Project object */ = {
+            isa = PBXProject;
+            attributes = {
+                BuildIndependentTargetsInParallel = 1;
+                LastSwiftUpdateCheck = 1500;
+                LastUpgradeCheck = 1500;
+                TargetAttributes = {
+                    8A1C83722AC328BD0096AF73 = {
+                        CreatedOnToolsVersion = 15.0;
+                        LastSwiftMigration = 1500;
+                    };
+                };
+            };
+            buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
+            compatibilityVersion = "Xcode 14.0";
+            developmentRegion = en;
+            hasScannedForEncodings = 0;
+            knownRegions = (
+                en,
+                Base,
+            );
+            mainGroup = 8A1C836A2AC328BD0096AF73;
+            packageReferences = (
+            );
+            productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
+            projectDirPath = "";
+            projectRoot = "";
+            targets = (
+                8A1C83722AC328BD0096AF73 /* llama.swiftui */,
+            );
+        };
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+        8A1C83712AC328BD0096AF73 /* Resources */ = {
+            isa = PBXResourcesBuildPhase;
+            buildActionMask = 2147483647;
+            files = (
+                542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
+                8A3F84242AC4C891005E2EE8 /* models in Resources */,
+                8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
+                8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
+            );
+            runOnlyForDeploymentPostprocessing = 0;
+        };
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+        8A1C836F2AC328BD0096AF73 /* Sources */ = {
+            isa = PBXSourcesBuildPhase;
+            buildActionMask = 2147483647;
+            files = (
+                542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
+                549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
+                542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
+                8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
+                542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
+                8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
+                8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
+                8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+                542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
+                5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
+            );
+            runOnlyForDeploymentPostprocessing = 0;
+        };
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+        8A1C837F2AC328BE0096AF73 /* Debug */ = {
+            isa = XCBuildConfiguration;
+            buildSettings = {
+                ALWAYS_SEARCH_USER_PATHS = NO;
+                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+                CLANG_ANALYZER_NONNULL = YES;
+                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+                CLANG_ENABLE_MODULES = YES;
+                CLANG_ENABLE_OBJC_ARC = YES;
+                CLANG_ENABLE_OBJC_WEAK = YES;
+                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+                CLANG_WARN_BOOL_CONVERSION = YES;
+                CLANG_WARN_COMMA = YES;
+                CLANG_WARN_CONSTANT_CONVERSION = YES;
+                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+                CLANG_WARN_EMPTY_BODY = YES;
+                CLANG_WARN_ENUM_CONVERSION = YES;
+                CLANG_WARN_INFINITE_RECURSION = YES;
+                CLANG_WARN_INT_CONVERSION = YES;
+                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+                CLANG_WARN_STRICT_PROTOTYPES = YES;
+                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+                CLANG_WARN_UNREACHABLE_CODE = YES;
+                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+                COPY_PHASE_STRIP = NO;
+                DEBUG_INFORMATION_FORMAT = dwarf;
+                ENABLE_STRICT_OBJC_MSGSEND = YES;
+                ENABLE_TESTABILITY = YES;
+                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+                GCC_C_LANGUAGE_STANDARD = gnu17;
+                GCC_DYNAMIC_NO_PIC = NO;
+                GCC_NO_COMMON_BLOCKS = YES;
+                GCC_OPTIMIZATION_LEVEL = 0;
+                GCC_PREPROCESSOR_DEFINITIONS = (
+                    "DEBUG=1",
+                    "$(inherited)",
+                );
+                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+                GCC_WARN_UNDECLARED_SELECTOR = YES;
+                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+                GCC_WARN_UNUSED_FUNCTION = YES;
+                GCC_WARN_UNUSED_VARIABLE = YES;
+                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+                MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+                MTL_FAST_MATH = YES;
+                ONLY_ACTIVE_ARCH = YES;
+                SDKROOT = iphoneos;
+                SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+            };
+            name = Debug;
+        };
+        8A1C83802AC328BE0096AF73 /* Release */ = {
+            isa = XCBuildConfiguration;
+            buildSettings = {
+                ALWAYS_SEARCH_USER_PATHS = NO;
+                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+                CLANG_ANALYZER_NONNULL = YES;
+                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+                CLANG_ENABLE_MODULES = YES;
+                CLANG_ENABLE_OBJC_ARC = YES;
+                CLANG_ENABLE_OBJC_WEAK = YES;
+                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+                CLANG_WARN_BOOL_CONVERSION = YES;
+                CLANG_WARN_COMMA = YES;
+                CLANG_WARN_CONSTANT_CONVERSION = YES;
+                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+                CLANG_WARN_EMPTY_BODY = YES;
+                CLANG_WARN_ENUM_CONVERSION = YES;
+                CLANG_WARN_INFINITE_RECURSION = YES;
+                CLANG_WARN_INT_CONVERSION = YES;
+                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+                CLANG_WARN_STRICT_PROTOTYPES = YES;
+                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+                CLANG_WARN_UNREACHABLE_CODE = YES;
+                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+                COPY_PHASE_STRIP = NO;
+                DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+                ENABLE_NS_ASSERTIONS = NO;
+                ENABLE_STRICT_OBJC_MSGSEND = YES;
+                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+                GCC_C_LANGUAGE_STANDARD = gnu17;
+                GCC_NO_COMMON_BLOCKS = YES;
+                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+                GCC_WARN_UNDECLARED_SELECTOR = YES;
+                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+                GCC_WARN_UNUSED_FUNCTION = YES;
+                GCC_WARN_UNUSED_VARIABLE = YES;
+                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+                MTL_ENABLE_DEBUG_INFO = NO;
+                MTL_FAST_MATH = YES;
+                SDKROOT = iphoneos;
+                SWIFT_COMPILATION_MODE = wholemodule;
+                VALIDATE_PRODUCT = YES;
+            };
+            name = Release;
+        };
+        8A1C83822AC328BE0096AF73 /* Debug */ = {
+            isa = XCBuildConfiguration;
+            buildSettings = {
+                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+                CLANG_ENABLE_MODULES = YES;
+                CODE_SIGN_STYLE = Automatic;
+                CURRENT_PROJECT_VERSION = 1;
+                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+                DEVELOPMENT_TEAM = STLSG3FG8Q;
+                ENABLE_PREVIEWS = YES;
+                GENERATE_INFOPLIST_FILE = YES;
+                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+                LD_RUNPATH_SEARCH_PATHS = (
+                    "$(inherited)",
+                    "@executable_path/Frameworks",
+                );
+                MARKETING_VERSION = 1.0;
+                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+                PRODUCT_NAME = "$(TARGET_NAME)";
+                SWIFT_EMIT_LOC_STRINGS = YES;
+                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+                SWIFT_VERSION = 5.0;
+                TARGETED_DEVICE_FAMILY = "1,2";
+            };
+            name = Debug;
+        };
+        8A1C83832AC328BE0096AF73 /* Release */ = {
+            isa = XCBuildConfiguration;
+            buildSettings = {
+                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+                CLANG_ENABLE_MODULES = YES;
+                CODE_SIGN_STYLE = Automatic;
+                CURRENT_PROJECT_VERSION = 1;
+                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+                DEVELOPMENT_TEAM = STLSG3FG8Q;
+                ENABLE_PREVIEWS = YES;
+                GENERATE_INFOPLIST_FILE = YES;
+                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+                LD_RUNPATH_SEARCH_PATHS = (
+                    "$(inherited)",
+                    "@executable_path/Frameworks",
+                );
+                MARKETING_VERSION = 1.0;
+                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+                PRODUCT_NAME = "$(TARGET_NAME)";
+                SWIFT_EMIT_LOC_STRINGS = YES;
+                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+                SWIFT_VERSION = 5.0;
+                TARGETED_DEVICE_FAMILY = "1,2";
+            };
+            name = Release;
+        };
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+        8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
+            isa = XCConfigurationList;
+            buildConfigurations = (
+                8A1C837F2AC328BE0096AF73 /* Debug */,
+                8A1C83802AC328BE0096AF73 /* Release */,
+            );
+            defaultConfigurationIsVisible = 0;
+            defaultConfigurationName = Release;
+        };
+        8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
+            isa = XCConfigurationList;
+            buildConfigurations = (
+                8A1C83822AC328BE0096AF73 /* Debug */,
+                8A1C83832AC328BE0096AF73 /* Release */,
+            );
+            defaultConfigurationIsVisible = 0;
+            defaultConfigurationName = Release;
+        };
+/* End XCConfigurationList section */
+    };
+    rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
+}
diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
new file mode 100644
index 000000000..919434a62
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:">
+   </FileRef>
+</Workspace>
diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
new file mode 100644
index 000000000..3d4c1e552
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>IDEDidComputeMac32BitWarning</key>
+    <true/>
+</dict>
+</plist>
diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
new file mode 100644
index 000000000..eb8789700
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
@@ -0,0 +1,11 @@
+{
+  "colors" : [
+    {
+      "idiom" : "universal"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 000000000..13613e3ee
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,13 @@
+{
+  "images" : [
+    {
+      "idiom" : "universal",
+      "platform" : "ios",
+      "size" : "1024x1024"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json
new file mode 100644
index 000000000..73c00596a
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
new file mode 100644
index 000000000..babc60cdc
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -0,0 +1,45 @@
+import Foundation
+
+@MainActor
+class LlamaState: ObservableObject { // View-model: owns the optional llama context and the text log that ContentView displays.
+    @Published var messageLog = "" // Running transcript: status lines, the prompt, and generated text are all appended here.
+
+    private var llamaContext: LlamaContext? // Stays nil when the model was not found or createContext threw.
+    private var modelUrl: URL? { // Location of the bundled model file; nil when the bundle lookup fails.
+        Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
+        // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
+    }
+    init() { // Attempts to load the model immediately; never throws to the caller.
+        do {
+            try loadModel()
+        } catch {
+            messageLog += "Error!\n" // The thrown error's details are not recorded, only this marker.
+        }
+    }
+
+    private func loadModel() throws { // Creates llamaContext from modelUrl, logging progress; rethrows createContext errors.
+        messageLog += "Loading model...\n"
+        if let modelUrl {
+            llamaContext = try LlamaContext.createContext(path: modelUrl.path())
+            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
+        } else {
+            messageLog += "Could not locate model\n" // Missing model is reported in the log, not thrown.
+        }
+    }
+
+    func complete(text: String) async { // Runs one completion for `text`, appending the prompt and each generated piece to messageLog.
+        guard let llamaContext else {
+            return // No model loaded: returns without writing anything to the log.
+        }
+        messageLog += "Attempting to complete text...\n"
+        await llamaContext.completion_init(text: text)
+        messageLog += "\(text)" // Echo the prompt so the generated text follows it in the log.
+
+        while await llamaContext.n_cur <= llamaContext.n_len { // NOTE(review): n_cur/n_len look like current position and length limit; confirm in LlamaContext.
+            let result = await llamaContext.completion_loop() // One generation step; its result is interpolated into the log as text.
+            messageLog += "\(result)"
+        }
+        await llamaContext.clear() // Presumably resets per-completion state for the next request; verify in LlamaContext.
+        messageLog += "\n\ndone\n"
+    }
+}
diff --git a/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json b/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json
new file mode 100644
index 000000000..73c00596a
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json	
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore b/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
new file mode 100644
index 000000000..0bd16a806
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -0,0 +1,42 @@
+import SwiftUI
+
+struct ContentView: View { // Single screen: scrollable message log on top, prompt editor and Send button below.
+    @StateObject var llamaState = LlamaState() // Constructing LlamaState triggers its model load attempt in init().
+
+    @State private var multiLineText = "" // Prompt text bound to the TextEditor.
+
+    var body: some View {
+        VStack {
+            ScrollView(.vertical) {
+                Text(llamaState.messageLog) // Shows everything LlamaState has appended so far.
+            }
+
+            TextEditor(text: $multiLineText)
+                .frame(height: 200)
+                .padding()
+                .border(Color.gray, width: 0.5)
+            Button(action: {
+                sendText()
+            }) {
+                Text("Send")
+                    .padding()
+                    .background(Color.blue)
+                    .foregroundColor(.white)
+                    .cornerRadius(8)
+            }
+        }
+        .padding()
+    }
+
+    func sendText() { // Starts a completion for the current editor text in a new Task.
+        Task {
+            await llamaState.complete(text: multiLineText)
+            multiLineText = "" // The editor is cleared only after complete(text:) has returned.
+        }
+    }
+}
+/*
+#Preview {
+    ContentView()
+}
+*/
diff --git a/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift b/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
new file mode 100644
index 000000000..cccda8a97
--- /dev/null
+++ b/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct llama_swiftuiApp: App { // Application entry point (marked @main).
+    var body: some Scene {
+        WindowGroup {
+            ContentView() // ContentView is the root and only view of the window group.
+        }
+    }
+}

From 0dab8cd7cca7e1bc3550dcb4797b9062cdbb1ebd Mon Sep 17 00:00:00 2001
From: Kasumi <90275229+kasumi-1@users.noreply.github.com>
Date: Tue, 28 Nov 2023 01:39:42 +0800
Subject: [PATCH 18/64] readme : add Amica to UI list (#4230)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2892132c4..d0d6c9666 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
+- [semperai/amica](https://github.com/semperai/amica)
 
 ---
 

From b38a16dfcff88d547f78f52d1bea31b84a05aff7 Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Mon, 27 Nov 2023 15:25:42 -0400
Subject: [PATCH 19/64] cmake : fix issue with version info not getting baked
 into LlamaConfig.cmake (#3970)

* Split CPP generation from build-info query

* Remove blank lines

* Add BUILD_SHARED_LIBS option
---
 CMakeLists.txt                   |  4 ++++
 common/CMakeLists.txt            |  2 +-
 scripts/build-info.cmake         | 22 ----------------------
 scripts/gen-build-info-cpp.cmake | 24 ++++++++++++++++++++++++
 4 files changed, 29 insertions(+), 23 deletions(-)
 create mode 100644 scripts/gen-build-info-cpp.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f32df5fe5..3e0009415 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ else()
 endif()
 
 # general
+option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
@@ -100,6 +101,9 @@ option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)
 
+# Required for relocatable CMake package
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+
 #
 # Compile flags
 #
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 4f930bdc5..71891edc3 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -26,7 +26,7 @@ add_custom_command(
     COMMENT "Generating build details from Git"
     COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
             -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
     VERBATIM
diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake
index 73853dfa4..ea3dc55c8 100644
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,5 +1,3 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -58,23 +56,3 @@ else()
     )
     set(BUILD_TARGET ${OUT})
 endif()
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET   STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
diff --git a/scripts/gen-build-info-cpp.cmake b/scripts/gen-build-info-cpp.cmake
new file mode 100644
index 000000000..d89338920
--- /dev/null
+++ b/scripts/gen-build-info-cpp.cmake
@@ -0,0 +1,24 @@
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) # provides BUILD_NUMBER, BUILD_COMMIT, BUILD_COMPILER, BUILD_TARGET
+
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ "${CONTENTS}") # quoted: an unquoted expansion is split on ';', dropping the ';' the regex needs
+    set(OLD_COMMIT "${CMAKE_MATCH_1}") # empty string when the pattern did not match
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ "${CONTENTS}")
+    set(OLD_COMPILER "${CMAKE_MATCH_1}")
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ "${CONTENTS}")
+    set(OLD_TARGET "${CMAKE_MATCH_1}")
+    if (
+        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
+        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
+        NOT OLD_TARGET   STREQUAL BUILD_TARGET
+    )
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) # regenerate: at least one recorded value differs from the current build
+    endif()
+else()
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) # first generation: no previous output to compare against
+endif()

From 8406b0924bf323f37d536dee8b8165c1f3d9d11d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 28 Nov 2023 10:32:03 +0200
Subject: [PATCH 20/64] ggml : re-enable BLAS for CPU when src0 != F32 + remove
 redundant full offload checks in llama.cpp (#4240)

* ggml : use blas even if src0 is not F32

* llama : use n_threads_batch only when n_tokens >= 32

ggml-ci

* llama : revert n_threads_batch logic

ggml-ci
---
 ggml.c    |  2 +-
 llama.cpp | 12 +-----------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/ggml.c b/ggml.c
index 0c7264a36..c522a101f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+      //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
diff --git a/llama.cpp b/llama.cpp
index f2b5967d7..cb544228b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5550,18 +5550,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA      ||
-        model.arch == LLM_ARCH_BAICHUAN   ||
-        model.arch == LLM_ARCH_FALCON     ||
-        model.arch == LLM_ARCH_REFACT     ||
-        model.arch == LLM_ARCH_MPT        ||
-        model.arch == LLM_ARCH_STARCODER  ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 

From 64e64aa2557d97490b2fe1262b313e2f4a1607e3 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 28 Nov 2023 04:51:11 -0500
Subject: [PATCH 21/64] ggml : restore abort() in GGML_ASSERT (#4242)

---
 ggml.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ggml.h b/ggml.h
index f2fce0f22..4d6d4edfd 100644
--- a/ggml.h
+++ b/ggml.h
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-            exit(1); \
+            abort(); \
         } \
     } while (0)
 

From 4fea3420ee3918d125d74c94d962a6ea82875351 Mon Sep 17 00:00:00 2001
From: Peter Sugihara <peter@campsh.com>
Date: Tue, 28 Nov 2023 23:16:34 -0800
Subject: [PATCH 22/64] readme : add FreeChat (#4248)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index d0d6c9666..44cc94093 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 
 ---
 

From 1f5cd83275fabb43f2ae92c30033b384a3eb37b4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 29 Nov 2023 11:00:17 +0200
Subject: [PATCH 23/64] examples : add readme files

---
 examples/lookahead/README.md   | 7 +++++++
 examples/speculative/README.md | 8 ++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 examples/lookahead/README.md
 create mode 100644 examples/speculative/README.md

diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md
new file mode 100644
index 000000000..252a6689e
--- /dev/null
+++ b/examples/lookahead/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/examples/lookahead
+
+Demonstration of lookahead decoding technique:
+
+https://lmsys.org/blog/2023-11-21-lookahead-decoding/
+
+More info: https://github.com/ggerganov/llama.cpp/pull/4207
diff --git a/examples/speculative/README.md b/examples/speculative/README.md
new file mode 100644
index 000000000..d88fd3790
--- /dev/null
+++ b/examples/speculative/README.md
@@ -0,0 +1,8 @@
+# llama.cpp/examples/speculative
+
+Demonstration of speculative decoding and tree-based speculative decoding techniques
+
+More info:
+
+- https://github.com/ggerganov/llama.cpp/pull/2926
+- https://github.com/ggerganov/llama.cpp/pull/3624

From e2bd725f4b39bc5c6234858d158e01248f5ab5bd Mon Sep 17 00:00:00 2001
From: rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
Date: Thu, 30 Nov 2023 20:50:40 +0000
Subject: [PATCH 24/64] py : fix oai proxy (#3972)

* fix oai proxy

fix generation not stoped while bot stop talking in chat mode

fix possible `slot_id` not exist

response for cors (and pre flight)

* oai proxy: workaround for some client (such as Chatbox)

* use stop as separator to replace hardcoded `\n`
---
 examples/server/api_like_OAI.py | 46 ++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index 313e1a965..830c056d4 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -11,10 +11,10 @@ app = Flask(__name__)
 slot_id = -1
 
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
-parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
-parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
-parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
-parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
+parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
+parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
+parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
+parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
 parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
 parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
 parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
@@ -34,19 +34,19 @@ def is_present(json, key):
 
 #convert chat to prompt
 def convert_chat(messages):
-    prompt = "" + args.chat_prompt.replace("\\n", "\n")
 
-    system_n = args.system_name.replace("\\n", "\n")
-    user_n = args.user_name.replace("\\n", "\n")
-    ai_n = args.ai_name.replace("\\n", "\n")
-    stop = args.stop.replace("\\n", "\n")
+    system_n = args.system_name
+    user_n = args.user_name
+    ai_n = args.ai_name
+    stop = args.stop
 
+    prompt = "" + args.chat_prompt + stop
 
     for line in messages:
         if (line["role"] == "system"):
-            prompt += f"{system_n}{line['content']}"
+            prompt += f"{system_n}{line['content']}{stop}"
         if (line["role"] == "user"):
-            prompt += f"{user_n}{line['content']}"
+            prompt += f"{user_n}{line['content']}{stop}"
         if (line["role"] == "assistant"):
             prompt += f"{ai_n}{line['content']}{stop}"
     prompt += ai_n.rstrip()
@@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
             }
         ]
     }
-    slot_id = data["slot_id"]
+    slot_id = data.get("slot_id")
     if (chat):
         if (start):
             resData["choices"][0]["delta"] =  {
@@ -150,11 +150,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
     return resData
 
 
-@app.route('/chat/completions', methods=['POST'])
-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
 def chat_completions():
     if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
         return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
     body = request.get_json()
     stream = False
     tokenize = False
@@ -177,20 +179,22 @@ def chat_completions():
             data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
             time_now = int(time.time())
             resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
-            yield 'data: {}\n'.format(json.dumps(resData))
+            yield 'data: {}\n\n'.format(json.dumps(resData))
             for line in data.iter_lines():
                 if line:
                     decoded_line = line.decode('utf-8')
                     resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
-                    yield 'data: {}\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream')
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
 
 
-@app.route('/completions', methods=['POST'])
-@app.route('/v1/completions', methods=['POST'])
+@app.route('/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
 def completion():
     if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
         return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
     body = request.get_json()
     stream = False
     tokenize = False
@@ -216,8 +220,8 @@ def completion():
                 if line:
                     decoded_line = line.decode('utf-8')
                     resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
-                    yield 'data: {}\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream')
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
 
 if __name__ == '__main__':
     app.run(args.host, port=args.port)

From 954e22858c5cea1dc03e9172d3879402af2b5990 Mon Sep 17 00:00:00 2001
From: tarcey <cey.tarik@gmail.com>
Date: Thu, 30 Nov 2023 22:40:23 +0100
Subject: [PATCH 25/64] llama : fix typical sampling (#4261)

Typical sampling was broken because after copying new_candidates into candidates, the "sorted" bool is left at "true", but the new data is no longer sorted according to probability. This patch sets "sorted" to false.

Test: Generating with temp=0.0001 (approx. argmax) should generate the same sequence at typical>=1.0 and typical=0.9999 (approx. disabled, but enters the typical sampling codepath).
---
 llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.cpp b/llama.cpp
index cb544228b..4af450615 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7027,6 +7027,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     // Replace the data in candidates with the new_candidates data
     std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
     candidates->size = new_candidates.size();
+    candidates->sorted = false;
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;

From f4d973cecb7368c985720ba9100ae6abba14806d Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 Nov 2023 22:42:23 +0100
Subject: [PATCH 26/64] convert.py : fix llama/llama2 conversion due to
 vocab_size=-1 (#4258)

---
 convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index 3ad836ce0..6e95d6cb3 100755
--- a/convert.py
+++ b/convert.py
@@ -267,7 +267,7 @@ class Params:
             n_ctx = 2048
 
         return Params(
-            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
             n_embd           = config["dim"],
             n_layer          = config["n_layers"],
             n_ctx            = n_ctx,

From b18c66ca6eee4fe0465cff5042daf05005dc9ab2 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Thu, 30 Nov 2023 22:43:08 +0100
Subject: [PATCH 27/64] llama : fix alignment of general.name in print meta
 (#4254)

* llama: fix alignment of general.name in print meta

This commit fixes the alignment of the general.name field in the
llm_load_print_meta function.

Currently the output looks like this:
```console
llm_load_print_meta: model ftype      = mostly Q4_0
llm_load_print_meta: model params     = 13.02 B
llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
llm_load_print_meta: general.name   = LLaMA v2
```
And with this commit it looks like this:
```console
llm_load_print_meta: model ftype      = mostly Q4_0
llm_load_print_meta: model params     = 13.02 B
llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
llm_load_print_meta: general.name     = LLaMA v2
```

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* llama: fix alignment of special tokens

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 llama.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4af450615..26754ef72 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2645,15 +2645,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name   = %s\n",    __func__, model.name.c_str());
+    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
 }
 
 static void llm_load_tensors(

From 74daabae6927b99e7333d6126dee35193c418457 Mon Sep 17 00:00:00 2001
From: Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
Date: Thu, 30 Nov 2023 22:43:32 +0100
Subject: [PATCH 28/64] readme : fix typo (#4253)

llama.cpp uses GitHub Actions, not Gitlab Actions.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 44cc94093..b89ba73aa 100644
--- a/README.md
+++ b/README.md
@@ -896,7 +896,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
 
 #### Usage
 

From f7f9e06212d44530b3200033286049dbdf84b3d3 Mon Sep 17 00:00:00 2001
From: Li Tan <tanliboy@gmail.com>
Date: Thu, 30 Nov 2023 13:44:11 -0800
Subject: [PATCH 29/64] cmake : fix the metal file folder path (#4217)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e0009415..6f35a25d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -165,7 +165,7 @@ if (LLAMA_METAL)
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
 
     # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
         ${FOUNDATION_LIBRARY}

From bde629bb53b85886ee0fe83524c1efe2689bc618 Mon Sep 17 00:00:00 2001
From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Date: Fri, 1 Dec 2023 06:45:17 +0900
Subject: [PATCH 30/64] batched.swift : update README.md (#4214)

docs: update how to run
---
 examples/batched.swift/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md
index 464c9079c..4c2721fe8 100644
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.
 
 $ `make`
-$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

From 3bd2c7ce1b752973cf937482a0333e85d1681e2b Mon Sep 17 00:00:00 2001
From: Juraj Bednar <juraj@bednar.io>
Date: Thu, 30 Nov 2023 22:46:01 +0100
Subject: [PATCH 31/64] docker : add finetune option (#4211)

---
 .devops/tools.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.devops/tools.sh b/.devops/tools.sh
index 9d999315f..3a7d274e4 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -34,6 +36,8 @@ else
     echo "              ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo "              See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "              ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"

From 524907aa768a26cbf83d8e2eb30547e2ee1d1b1a Mon Sep 17 00:00:00 2001
From: vodkaslime <646329483@qq.com>
Date: Fri, 1 Dec 2023 05:49:21 +0800
Subject: [PATCH 32/64] readme : fix (#4135)

* fix: readme

* chore: resolve comments

* chore: resolve comments
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b89ba73aa..dac971ae5 100644
--- a/README.md
+++ b/README.md
@@ -324,7 +324,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 
 ### BLAS Build
 
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
 
 - #### Accelerate Framework:
 

From 8efa0f6ebed53c9453e6721da86fb294e5015909 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
Date: Thu, 30 Nov 2023 13:56:19 -0800
Subject: [PATCH 33/64] main : pass LOG_TEE callback to llama.cpp log (#4033)

* main : Call llama_log_set to use LOG_TEE

* tabs to spaces
---
 examples/main/main.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 31ec8cade..c5cdfbf21 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -100,6 +100,12 @@ static void sigint_handler(int signo) {
 }
 #endif
 
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -113,6 +119,7 @@ int main(int argc, char ** argv) {
     log_set_target(log_filename_generator("main", "log"));
     LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
 
     // TODO: Dump params ?

From 33c9892af58b7b161f2a532935dcccff8c8048c6 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Thu, 30 Nov 2023 23:11:14 +0100
Subject: [PATCH 34/64] llava : ShareGPT4V compatibility (vision encoder only
 loading) (#4172)

* ShareGPT4V compatibility (vision encoder only loading)

Loads only a CLIP vision encoder (as supplied by ShareGPT4V finetunes).
Corrects the argument parsing for --image_mean and --image_std (which were previously accessed but never parsed).
Defines defaults for image_mean and image_std that are equal to those of the LLaVA 1.5 CLIP encoder, so you do not have to provide them.

* Update convert-image-encoder-to-gguf.py
---
 .../llava/convert-image-encoder-to-gguf.py    | 52 +++++++++++++------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 2f5eef199..729aaef8f 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -5,7 +5,7 @@ import json
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 
 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
 
+# with proper
 args = ap.parse_args()
 
 
@@ -96,15 +104,22 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-    tokens = [key for key in vocab]
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
 
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     config = json.load(f)
-    v_hparams = config["vision_config"]
-    t_hparams = config["text_config"]
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]
 
 # possible data types
 #   ftype == 0 -> float32
@@ -117,9 +132,12 @@ ftype = 1
 if args.use_f32:
     ftype = 0
 
-
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
 has_text_encoder = True
@@ -128,13 +146,13 @@ has_llava_projector = False
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 elif args.llava_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
     fname_middle = ""
 
@@ -182,8 +200,12 @@ if has_vision_encoder:
     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
 
-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
     fout.add_array("clip.vision.image_mean", image_mean)
     fout.add_array("clip.vision.image_std", image_std)
 

From 15f5d96037e597523b721aa39c874d69de2acf85 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 30 Nov 2023 17:23:08 -0500
Subject: [PATCH 35/64] build : fix build info generation and cleanup Makefile
 (#3920)

* cmake : fix joining of REAL_GIT_DIR

* fix includes with help from include-what-you-use

* make : remove unneeded deps and add test-rope target

* fix C includes in C++ source files

* Revert "fix includes with help from include-what-you-use"

This reverts commit 635e9fadfd516d4604a0fecf4a854bfb25ad17ae.
---
 .gitignore            | 25 +++++++++++++------------
 Makefile              | 23 +++++++++++++----------
 common/CMakeLists.txt |  7 ++++++-
 ggml-opencl.cpp       | 12 +++++-------
 llama.cpp             |  1 -
 5 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3806e05dd..58c483994 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,15 +88,16 @@ poetry.lock
 poetry.toml
 
 # Test binaries
-tests/test-grammar-parser
-tests/test-llama-grammar
-tests/test-double-float
-tests/test-grad0
-tests/test-opt
-tests/test-quantize-fns
-tests/test-quantize-perf
-tests/test-sampling
-tests/test-tokenizer-0-llama
-tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1-llama
-tests/test-tokenizer-1-bpe
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0-llama
+/tests/test-tokenizer-0-falcon
+/tests/test-tokenizer-1-llama
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
diff --git a/Makefile b/Makefile
index 95d85236f..22132ae23 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -648,7 +648,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
@@ -701,28 +701,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -737,5 +737,8 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 71891edc3..b5d5453d2 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -11,7 +11,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
     if(NOT IS_DIRECTORY "${GIT_DIR}")
         file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
         string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
+        if (SLASH_POS EQUAL 0)
+            set(GIT_DIR "${REAL_GIT_DIR}")
+        else()
+            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        endif()
     endif()
 
     set(GIT_INDEX "${GIT_DIR}/index")
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 202bcb485..496f9cdca 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1,20 +1,18 @@
+#include "ggml.h"
 #include "ggml-opencl.h"
 
 #include <array>
 #include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
-#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
diff --git a/llama.cpp b/llama.cpp
index 26754ef72..1e00ea4a9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -46,7 +46,6 @@
     #endif
     #include <windows.h>
     #include <io.h>
-    #include <stdio.h> // for _fseeki64
 #endif
 
 #include <algorithm>

From d2809a3ba2780e00fce5a6149a7eda09f1c0e906 Mon Sep 17 00:00:00 2001
From: WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
Date: Thu, 30 Nov 2023 17:23:44 -0500
Subject: [PATCH 36/64] make : fix Apple clang determination bug (#4272)

Co-authored-by: Will Findley <findley@gmail.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 22132ae23..25b113e0a 100644
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
 	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	CC_IS_CLANG=1
-	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
 		CC_IS_LLVM_CLANG=1
 	else
 		CC_IS_APPLE_CLANG=1

From f43f09366dfd018e4568e23a232aaa8c4f7cfc78 Mon Sep 17 00:00:00 2001
From: Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Date: Thu, 30 Nov 2023 17:25:04 -0500
Subject: [PATCH 37/64] server : add single-client multi-prompt support (#4232)

* * add multiprompt support

* * cleanup

* * more cleanup

* * remove atomicity of id_gen, and change lock_guard to unique_lock on completion requests

* * remove all references to mutex_multitasks

* Update examples/server/server.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update examples/server/server.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update examples/server/server.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update examples/server/server.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* * change to set

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
---
 examples/server/server.cpp | 139 ++++++++++++++++++++++++++++++++++---
 1 file changed, 128 insertions(+), 11 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 50f124b13..5edb3678e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -155,15 +155,23 @@ struct task_server {
     json data;
     bool infill_mode = false;
     bool embedding_mode = false;
+    int multitask_id = -1;
 };
 
 struct task_result {
     int id;
+    int multitask_id = -1;
     bool stop;
     bool error;
     json result_json;
 };
 
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
 // TODO: can become bool if we can't find use of more states
 enum slot_state
 {
@@ -406,6 +414,9 @@ struct llama_client_slot
     double t_prompt_processing; // ms
     double t_token_generation; // ms
 
+    // multitasks
+    int multitask_id = -1;
+
     void reset() {
         num_prompt_tokens      = 0;
         generated_text         = "";
@@ -529,7 +540,8 @@ struct llama_server_context
 
     std::vector<task_server> queue_tasks;
     std::vector<task_result> queue_results;
-    std::mutex mutex_tasks;
+    std::vector<task_multi>  queue_multitasks;
+    std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
     std::mutex mutex_results;
 
     ~llama_server_context()
@@ -1112,17 +1124,40 @@ struct llama_server_context
         return slot.images.size() > 0;
     }
 
-    void send_error(int id, std::string error)
+    void send_error(task_server& task, std::string error)
     {
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
-        res.id = id;
+        res.id = task.id;
+        res.multitask_id = task.multitask_id;
         res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
     }
 
+    void add_multi_task(int id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    void update_multi_task(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+
     json get_model_props()
     {
         return get_formated_generation(slots[0]);
@@ -1167,6 +1202,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
         res.error = false;
         res.stop = false;
 
@@ -1206,6 +1242,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
         res.error = false;
         res.stop = true;
 
@@ -1251,6 +1288,12 @@ struct llama_server_context
             res.result_json["model"] = slot.oaicompat_model;
         }
 
+        // parent multitask, if any, needs to be updated
+        if (slot.multitask_id != -1)
+        {
+            update_multi_task(slot.multitask_id, slot.task_id, res);
+        }
+
         queue_results.push_back(res);
     }
 
@@ -1259,6 +1302,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
         res.error = false;
         res.stop = true;
 
@@ -1285,9 +1329,9 @@ struct llama_server_context
         queue_results.push_back(res);
     }
 
-    int request_completion(json data, bool infill, bool embedding)
+    int request_completion(json data, bool infill, bool embedding, int multitask_id)
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
         task.target_id = 0;
@@ -1295,6 +1339,16 @@ struct llama_server_context
         task.infill_mode = infill;
         task.embedding_mode = embedding;
         task.type = COMPLETION_TASK;
+        task.multitask_id = multitask_id;
+
+        // when a completion task's prompt array is not a singleton, we split it into multiple requests
+        if (task.data.at("prompt").size() > 1)
+        {
+            lock.unlock(); // entering new func scope
+            return split_multiprompt_task(task);
+        }
+
+        // otherwise, it's a single-prompt task, we actually queue it
         queue_tasks.push_back(task);
         return task.id;
     }
@@ -1313,8 +1367,17 @@ struct llama_server_context
 
             for (int i = 0; i < (int) queue_results.size(); i++)
             {
+                // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+                if (queue_results[i].multitask_id == task_id)
+                {
+                    update_multi_task(task_id, queue_results[i].id, queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    continue;
+                }
+
                 if (queue_results[i].id == task_id)
                 {
+                    assert(queue_results[i].multitask_id == -1);
                     task_result res = queue_results[i];
                     queue_results.erase(queue_results.begin() + i);
                     return res;
@@ -1404,6 +1467,27 @@ struct llama_server_context
         queue_tasks.push_back(task);
     }
 
+    // Splits a multi-prompt completion task into one subtask per prompt and returns the parent multitask id.
+    // Must be called WITHOUT mutex_tasks held: this function, request_completion() and add_multi_task() all lock it.
+    int split_multiprompt_task(task_server& multiprompt_task)
+    {
+        const size_t prompt_count = multiprompt_task.data.at("prompt").size();
+        assert(prompt_count > 1);
+        // id_gen is guarded by mutex_tasks, so hold the lock just long enough to draw the multitask id
+        const int multitask_id = [this] { std::lock_guard<std::mutex> lock(mutex_tasks); return id_gen++; }();
+        std::vector<int> subtask_ids(prompt_count);
+        for (size_t i = 0; i < prompt_count; i++)
+        {
+            // each subtask copies the parent's data with "prompt" narrowed to entry i; it inherits everything else (infill mode, embedding mode, etc.)
+            json subtask_data = multiprompt_task.data;
+            subtask_data["prompt"] = subtask_data["prompt"][i];
+            subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+        }
+        // queue up the multitask so we can track its subtask progression
+        add_multi_task(multitask_id, subtask_ids);
+        return multitask_id;
+    }
+
     void process_tasks()
     {
         std::lock_guard<std::mutex> lock(mutex_tasks);
@@ -1419,7 +1503,7 @@ struct llama_server_context
                     {
                         LOG_TEE("slot unavailable\n");
                         // send error result
-                        send_error(task.id, "slot unavailable");
+                        send_error(task, "slot unavailable");
                         return;
                     }
 
@@ -1433,11 +1517,12 @@ struct llama_server_context
                     slot->infill = task.infill_mode;
                     slot->embedding = task.embedding_mode;
                     slot->task_id = task.id;
+                    slot->multitask_id = task.multitask_id;
 
                     if (!launch_slot_with_data(slot, task.data))
                     {
                         // send error result
-                        send_error(task.id, "internal_error");
+                        send_error(task, "internal_error");
                         break;
                     }
                 } break;
@@ -1453,6 +1538,38 @@ struct llama_server_context
                 } break;
             }
         }
+
+        // retire finished multitasks: fold their subtask results into one aggregate result and add it to the result queue
+        auto queue_iterator = queue_multitasks.begin();
+        while (queue_iterator != queue_multitasks.end())
+        {
+            if (queue_iterator->subtasks_remaining.empty())
+            {
+                // all subtasks done == multitask is done; the aggregate is published under the multitask's own id
+                task_result aggregate_result;
+                aggregate_result.id = queue_iterator->id;
+                aggregate_result.multitask_id = -1; // top-level result: next_result() asserts -1 on results it returns by id
+                aggregate_result.stop = true;
+                aggregate_result.error = false;
+                // collect json results into one json result; the aggregate is an error if ANY subtask failed
+                std::vector<json> result_jsons;
+                for (auto& subres : queue_iterator->results)
+                {
+                    result_jsons.push_back(subres.result_json);
+                    aggregate_result.error = aggregate_result.error || subres.error;
+                }
+                // double braces build the object {"results": [...]}; single braces would build the array ["results", [...]]
+                aggregate_result.result_json = json{ { "results", result_jsons } };
+                std::lock_guard<std::mutex> lock_results(mutex_results);
+                queue_results.push_back(aggregate_result);
+                // NOTE(review): mutex_results is taken here while mutex_tasks is presumably held; update_multi_task() callers appear to lock in the opposite order - confirm
+                queue_iterator = queue_multitasks.erase(queue_iterator);
+            }
+            else
+            {
+                ++queue_iterator;
+            }
+        }
     }
 
     bool update_slots() {
@@ -2596,7 +2713,7 @@ int main(int argc, char **argv)
     svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2685,7 +2802,7 @@ int main(int argc, char **argv)
             {
                 json data = oaicompat_completion_params_parse(json::parse(req.body));
 
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);
 
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
@@ -2754,7 +2871,7 @@ int main(int argc, char **argv)
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false);
+                const int task_id = llama.request_completion(data, true, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2858,7 +2975,7 @@ int main(int argc, char **argv)
                 {
                     prompt = "";
                 }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
                 task_result result = llama.next_result(task_id);
                 return res.set_content(result.result_json.dump(), "application/json");
             });

From 1d144112c0fbbb4ecc07dbcf4f05a380148bd6de Mon Sep 17 00:00:00 2001
From: Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Date: Thu, 30 Nov 2023 17:25:49 -0500
Subject: [PATCH 38/64] server : add --log-disable to disable logging to file
 (#4260)

* add --log-disable to disable logging to file in the server example

* typo fix
---
 examples/server/server.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5edb3678e..a65344b92 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1961,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("    -spf FNAME, --system-prompt-file FNAME\n");
     printf("                        Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
+    printf("  --log-disable         disables logging to a file.\n");
     printf("\n");
 }
 
@@ -2315,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.mmproj = argv[i];
         }
+        else if (arg == "--log-disable")
+        {
+            log_set_target(stdout);
+            LOG_INFO("logging to file is disabled.", {});
+        }
         else
         {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());

From ef47ec18da469423c276b683dd9b5741cee7023e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Dec 2023 10:51:24 +0200
Subject: [PATCH 39/64] ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
---
 examples/batched-bench/batched-bench.cpp |   2 +-
 ggml-alloc.c                             |   2 +-
 ggml-cuda.cu                             | 130 +++++++++-----
 ggml-metal.m                             |  43 +++--
 ggml-metal.metal                         | 210 +++++++++++------------
 ggml.c                                   |  77 +++++++--
 ggml.h                                   |   8 +
 llama.cpp                                |  35 ++--
 8 files changed, 311 insertions(+), 196 deletions(-)

diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 533c55c17..57596ed98 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
diff --git a/ggml-alloc.c b/ggml-alloc.c
index cdfe4caf6..0d4e12ae9 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    size_t cur_max = (char*)addr - (char*)alloc->base + size;
     if (cur_max > alloc->max_size) {
         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5b80e4ae3..9019a849f 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) { // sums x across the 32 lanes of a warp
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) { // XOR-shuffle exchange with the lane at distance 16, 8, 4, 2, 1
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32); // 0xffffffff: all 32 lanes take part
+    }
+    return x; // after the last step every lane holds the full total
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { // float2 overload: sums .x and .y independently across the warp
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) { // same XOR-shuffle steps as the scalar overload
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a; // every lane holds both totals
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) { // maximum of x across the 32 lanes of a warp
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) { // XOR-shuffle exchange with the lane at distance 16, 8, 4, 2, 1
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); // keep the larger of own and partner value
+    }
+    return x; // every lane holds the warp-wide maximum
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4717,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }
 
-// the CUDA soft max implementation differs from the CPU implementation
-// instead of doubles floats are used
-static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int block_size = blockDim.y;
-    const int tid = threadIdx.y;
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
     float max_val = -INFINITY;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        max_val = max(max_val, x[i]);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
     }
 
     // find the max value in the block
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }
 
     float tmp = 0.f;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        const float val = expf(x[i] - max_val);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
         tmp += val;
-        dst[i] = val;
+        dst[ix] = val;
     }
 
-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        // buf still holds the per-warp maxima: every warp must have read them before buf is reused for the sums
+        __syncthreads();
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
 
     const float inv_tmp = 1.f / tmp;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
@@ -5792,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(1, WARP_SIZE, 1);
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE; // start with a single warp per block
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; // double until the block covers a row or reaches the CUDA_SOFT_MAX_BLOCK_SIZE cap
+    const dim3 block_dims(nth,     1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale); // one block per row of x
 }
 
 static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6846,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-    soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
diff --git a/ggml-metal.m b/ggml-metal.m
index d52a1c3c4..6cfacf64f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
                             int nth = 32; // SIMD width
 
                             if (ne00%4 == 0) {
+                                while (nth < ne00/4 && nth < 256) {
+                                    nth *= 2;
+                                }
                                 [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                             } else {
-                                do {
+                                while (nth < ne00 && nth < 1024) {
                                     nth *= 2;
-                                } while (nth <= ne00 && nth <= 1024);
-                                nth /= 2;
+                                }
                                 [encoder setComputePipelineState:ctx->pipeline_soft_max];
                             }
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+
+                            const float scale = ((float *) dst->op_params)[0];
+
+                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
+                            [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
+                            [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
+                            [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
+                            [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
@@ -1351,15 +1358,19 @@ void ggml_metal_graph_compute(
                             float eps;
                             memcpy(&eps, dst->op_params, sizeof(float));
 
-                            const int nth = MIN(512, ne00);
+                            int nth = 32; // SIMD width
+
+                            while (nth < ne00/4 && nth < 1024) {
+                                nth *= 2;
+                            }
 
                             [encoder setComputePipelineState:ctx->pipeline_rms_norm];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                             const int64_t nrows = ggml_nrows(src0);
 
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 5d1357cd7..9a79f815f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -39,6 +39,8 @@ typedef struct {
     int8_t  qs[QK8_0]; // quants
 } block_q8_0;
 
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
 // general-purpose kernel for addition of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
 // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(
 
 kernel void kernel_soft_max(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
@@ -194,73 +198,77 @@ kernel void kernel_soft_max(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device const float * psrc0 =        src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device const float * pmask = src1 ? src1                                      + i01*ne00 : nullptr;
+    device       float * pdst  =        dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
     // parallel max
-    float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
+    float lmax = -INFINITY;
 
-    for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]);
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
+    // find the max value in the block
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
     }
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
-
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum += exp_psrc0;
-        // Remember the result of exp here. exp is expensive, so we really do not
-        // wish to compute it twice.
         pdst[i00] = exp_psrc0;
     }
 
     float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
+    if (ntg > N_SIMDWIDTH) {
+        // buf still holds the per-simdgroup maxima: every simdgroup must have read them before buf is reused for the sums
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] += buf[tpitg + i];
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00] /= sum;
+        pdst[i00] *= inv_sum;
     }
 }
 
 kernel void kernel_soft_max_4(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
@@ -271,64 +279,68 @@ kernel void kernel_soft_max_4(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
-    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const float4 * psrc4 =        (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const float4 * pmask = src1 ? (device const float4 *)(src1 +                                      i01*ne00) : nullptr;
+    device       float4 * pdst4 =        (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+    float4 lmax4 = -INFINITY;
 
-    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]);
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
+
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
     }
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
-
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp(psrc4[i00] - max);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
 
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
     float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
+    if (ntg > N_SIMDWIDTH) {
+        // buf still holds the per-simdgroup maxima: every simdgroup must have read them before buf is reused for the sums
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] += buf[tpitg + i];
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00] /= sum;
+        pdst4[i00] *= inv_sum;
     }
 }
 
@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
         constant   int64_t & ne00,
         constant  uint64_t & nb01,
         constant     float & eps,
-        threadgroup float  * sum [[threadgroup(0)]],
+        threadgroup float  * buf [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x        = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float  * x_scalar = (device const float  *) x;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
 
     float4 sumf = 0;
     float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
     }
     all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
     all_sum = simd_sum(all_sum);
-    if (tiisg == 0) {
-        sum[sgitg] = all_sum;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           sum[tpitg] += sum[tpitg + i];
-       }
-    }
-    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
-            sum[0] += x_scalar[i];
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
         }
-        sum[0] /= ne00;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = all_sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        all_sum = buf[tiisg];
+        all_sum = simd_sum(all_sum);
     }
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    const float mean  = sum[0];
+    const float mean  = all_sum/ne00;
     const float scale = 1.0f/sqrt(mean + eps);
 
     device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    device float * y_scalar = (device float *) y;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
-    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
-            y_scalar[i00] = x_scalar[i00] * scale;
-        }
-    }
 }
 
 // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4        // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2  // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
 //      giard against the number of rows not being divisible by
diff --git a/ggml.c b/ggml.c
index c522a101f..e2687ef4f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
         bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
@@ -10575,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, sp);
+        ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -15899,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
+            case GGML_OP_SOFT_MAX:
+                {
+                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
                 {
                     GGML_ASSERT(node->src[0]->ne[3] == 1);
diff --git a/ggml.h b/ggml.h
index 4d6d4edfd..2f6787d4e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1282,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/llama.cpp b/llama.cpp
index 1e00ea4a9..e74fd7234 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3704,23 +3704,29 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        // TODO: n_head or n_head_kv
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
+
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }
+
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
+
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
     }
 
-    kq = ggml_add(ctx, kq, kq_mask);
-    cb(kq, "kq_masked", il);
-
-    kq = ggml_soft_max(ctx, kq);
-    cb(kq, "kq_soft_max", il);
-
     // split cached v into n_head heads
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v,
@@ -5041,6 +5047,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi",            OFFLOAD_FUNC_KQ  },
     { "kq_masked",                  OFFLOAD_FUNC_KQ  },
     { "kq_soft_max",                OFFLOAD_FUNC_V   },
+    { "kq_soft_max_ext",            OFFLOAD_FUNC_V   },
     { "v",                          OFFLOAD_FUNC_V   },
     { "kqv",                        OFFLOAD_FUNC_V   },
     { "kqv_merged",                 OFFLOAD_FUNC_V   },

From 8d6d9f033b8101f929e445cf45b39e1557ca7934 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Fri, 1 Dec 2023 10:41:56 +0100
Subject: [PATCH 40/64] py : add requirements file for convert-hf-to-gguf.py
 (#4277)

This commit adds a requirements file for the convert-hf-to-gguf.py
script, and also adds the torch and transformers packages to it.

The motivation for this is that currently running convert-hf-to-gguf.py
will produce the following error:
```console
$ python3 -m venv venv
$ source venv/bin/activate
(venv) $ pip install -r requirements.txt
Collecting numpy==1.24.4
Collecting sentencepiece==0.1.98
Collecting gguf>=0.1.0
Installing collected packages: sentencepiece, numpy, gguf
Successfully installed gguf-0.5.1 numpy-1.24.4 sentencepiece-0.1.98

(venv) $ python convert-hf-to-gguf.py --help
Traceback (most recent call last):
  File "llama.cpp/convert-hf-to-gguf.py", line 16, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'
```
With this commit, and using requirements-hf-to-gguf.txt instead of
requirements.txt, the script can be run and shows the help output.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 requirements-hf-to-gguf.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 requirements-hf-to-gguf.txt

diff --git a/requirements-hf-to-gguf.txt b/requirements-hf-to-gguf.txt
new file mode 100644
index 000000000..f4600539e
--- /dev/null
+++ b/requirements-hf-to-gguf.txt
@@ -0,0 +1,3 @@
+-r requirements.txt
+torch==2.1.1
+transformers==4.35.2

From 880f57973b8e0091d0f9f50eb5ab4cd4e31582ca Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Dec 2023 18:42:11 +0200
Subject: [PATCH 41/64] llama : fix integer overflow during quantization
 (#4284)

happens with multi-threaded quantization of Qwen-72B

ggml-ci
---
 llama.cpp | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e74fd7234..6fbfeca54 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7655,18 +7655,21 @@ static void llama_convert_tensor_internal(
         return;
     }
 
-    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
-    auto block_size_bytes = ggml_type_size(tensor->type);
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
-    auto nblocks = nelements / block_size;
-    auto blocks_per_thread = nblocks / nthread;
-    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
-        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
-        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
-        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;
+
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
 
         auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
             if (typ == GGML_TYPE_F16) {

From 37c746d687d877bc11803e96b4dc5f378b83c0a0 Mon Sep 17 00:00:00 2001
From: Shijie <821898965@qq.com>
Date: Sat, 2 Dec 2023 02:16:31 +0800
Subject: [PATCH 42/64] llama : add Qwen support (#4281)

* enable Qwen in llama.cpp

* llama : do not GPU split bias tensors

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 convert-hf-to-gguf.py          | 131 +++++++++++++++++++-
 gguf-py/gguf/constants.py      |  20 ++++
 gguf-py/gguf/tensor_mapping.py |  18 +--
 llama.cpp                      | 211 +++++++++++++++++++++++++++++++++
 prompts/chat-with-qwen.txt     |   1 +
 5 files changed, 372 insertions(+), 9 deletions(-)
 create mode 100644 prompts/chat-with-qwen.txt

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 53ce76c70..bced1f561 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
 
 import numpy as np
 import torch
@@ -168,6 +168,8 @@ class Model:
             return PersimmonModel
         if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return StableLMModel
+        if model_architecture == "QWenLMHeadModel":
+            return QwenModel
         return Model
 
     def _is_model_safetensors(self) -> bool:
@@ -203,6 +205,8 @@ class Model:
             return gguf.MODEL_ARCH.PERSIMMON
         if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return gguf.MODEL_ARCH.STABLELM
+        if arch == "QWenLMHeadModel":
+            return gguf.MODEL_ARCH.QWEN
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -832,6 +836,131 @@ class StableLMModel(Model):
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)
 
+
+class QwenModel(Model):
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[self.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
+
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.special_tokens
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("Qwen")
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+
+    def write_tensors(self):
+        block_count = self.hparams["num_hidden_layers"]
+        model_kv = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
 ###### CONVERSION LOGIC ######
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 8bd82daca..685c88f1a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -92,6 +92,7 @@ class MODEL_ARCH(IntEnum):
     BERT      = auto()
     BLOOM     = auto()
     STABLELM  = auto()
+    QWEN      = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -132,6 +133,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BERT:           "bert",
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
+    MODEL_ARCH.QWEN:           "qwen",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -317,6 +319,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -336,6 +352,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.PERSIMMON: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 22ad8b8fc..cc6236014 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",                         # gptneox
-            "transformer.wte",                           # gpt2 gpt-j mpt refact
+            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
             "model.embed_tokens",                        # llama-hf
@@ -38,7 +38,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
             "output",                    # llama-pth bloom
             "word_embeddings_for_head",  # persimmon
         ),
@@ -51,7 +51,7 @@ class TensorNameMap:
             "norm",                                    # llama-pth
             "embeddings.LayerNorm",                    # bert
             "transformer.norm_f",                      # mpt
-            "ln_f",                                    # refact bloom
+            "ln_f",                                    # refact bloom qwen
             "language_model.encoder.final_layernorm",  # persimmon
         ),
 
@@ -65,7 +65,7 @@ class TensorNameMap:
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
-            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact
+            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
             "transformer.blocks.{bid}.norm_1",                      # mpt
             "transformer.h.{bid}.input_layernorm",                  # falcon7b
             "h.{bid}.input_layernorm",                              # bloom
@@ -85,7 +85,7 @@ class TensorNameMap:
         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
-            "transformer.h.{bid}.attn.c_attn",                                     # gpt2
+            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
@@ -119,7 +119,7 @@ class TensorNameMap:
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
-            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact
+            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact qwen
             "transformer.blocks.{bid}.attn.out_proj",                    # mpt
             "transformer.h.{bid}.self_attention.dense",                  # falcon
             "h.{bid}.self_attention.dense",                              # bloom
@@ -139,7 +139,7 @@ class TensorNameMap:
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm",                # gptneox
-            "transformer.h.{bid}.ln_2",                                      # gpt2 refact
+            "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen
             "h.{bid}.post_attention_layernorm",                              # bloom
             "transformer.blocks.{bid}.norm_2",                               # mpt
             "model.layers.{bid}.post_attention_layernorm",                   # llama-hf
@@ -161,18 +161,20 @@ class TensorNameMap:
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
+            "transformer.h.{bid}.mlp.w1",                             # qwen
         ),
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
             "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
             "layers.{bid}.feed_forward.w1",      # llama-pth
+            "transformer.h.{bid}.mlp.w2",        # qwen
         ),
 
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",                # gptneox
-            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact
+            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact qwen
             "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
diff --git a/llama.cpp b/llama.cpp
index 6fbfeca54..ca21cffab 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -192,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +209,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,          "refact"    },
     { LLM_ARCH_BLOOM,           "bloom"     },
     { LLM_ARCH_STABLELM,        "stablelm"  },
+    { LLM_ARCH_QWEN,            "qwen"      },
 };
 
 enum llm_kv {
@@ -518,6 +520,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -2347,6 +2365,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3310,6 +3337,71 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_QWEN:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = llama_backend_offload;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                        model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff / 2;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd * 3},         backend);
+                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv)     + ggml_nbytes(layer.bqkv)     +
+                                ggml_nbytes(layer.wo)        + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down)  + ggml_nbytes(layer.ffn_up);
+                        }
+                    }
+                } break;
 
             default:
                 throw std::runtime_error("unknown architecture");
@@ -4908,6 +5000,121 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
@@ -5382,6 +5589,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_stablelm();
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                result = llm.build_qwen();
+            } break;
         default:
             GGML_ASSERT(false);
     }
diff --git a/prompts/chat-with-qwen.txt b/prompts/chat-with-qwen.txt
new file mode 100644
index 000000000..ac39ad925
--- /dev/null
+++ b/prompts/chat-with-qwen.txt
@@ -0,0 +1 @@
+You are a helpful assistant.
\ No newline at end of file

From 03562f3a86d6706eea9f4fc09b532946c191b34e Mon Sep 17 00:00:00 2001
From: CausalLM <148736309+CausalLM@users.noreply.github.com>
Date: Sat, 2 Dec 2023 02:17:06 +0800
Subject: [PATCH 43/64] llama : support attention bias on LLaMA architecture
 (#4283)

* Support attention_bias on LLaMA architecture

QKVO bias, should fix InternLM (https://github.com/ggerganov/llama.cpp/issues/3133) and works for LLaMAfied Qwen models (https://github.com/ggerganov/llama.cpp/pull/3743#issuecomment-1825923608).

* check existence of qkvo bias while loading llama models

Tested on LLaMA2, CUDA and CPU.

* Update llama.cpp
---
 llama.cpp | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ca21cffab..15e52ad36 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1266,6 +1266,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
@@ -2809,6 +2812,30 @@ static void llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
 
+                        try {
+                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+                        }
+
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
                         layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
@@ -2817,9 +2844,14 @@ static void llm_load_tensors(
 
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)       + ggml_nbytes(layer.wk)       +
-                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo)       + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                                (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                                (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                                (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                                (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
@@ -3983,12 +4015,24 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
@@ -4007,7 +4051,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }

From 511f52c334e37033f9c9de07b98fca4abc9470bd Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 1 Dec 2023 13:18:35 -0500
Subject: [PATCH 44/64] build : enable libstdc++ assertions for debug builds
 (#4275)

---
 CMakeLists.txt | 5 +++++
 Makefile       | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f35a25d5..0639518de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+# enable libstdc++ assertions for debug builds
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+endif()
+
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
diff --git a/Makefile b/Makefile
index 25b113e0a..3cc932a2e 100644
--- a/Makefile
+++ b/Makefile
@@ -174,6 +174,10 @@ ifdef LLAMA_DEBUG
 	MK_CFLAGS   += -O0 -g
 	MK_CXXFLAGS += -O0 -g
 	MK_LDFLAGS  += -g
+
+	ifeq ($(UNAME_S),Linux)
+		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
 endif

From b220222a64ce760bfbec9c770f11db3ec6a6abb6 Mon Sep 17 00:00:00 2001
From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Date: Sat, 2 Dec 2023 03:19:45 +0900
Subject: [PATCH 45/64] swift : fix token_to_piece implementation (#4278)

* Fix token_to_piece implementation in Swift

* Fix errors
---
 examples/batched.swift/Sources/main.swift     | 10 +++------
 .../llama.cpp.swift/LibLlama.swift            | 22 +++++++++++++------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index ba15197ae..ce9d80d9b 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -230,18 +230,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
     var result = [CChar](repeating: 0, count: 8)
     let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
     if nTokens < 0 {
-        if result.count >= -Int(nTokens) {
-            result.removeLast(-Int(nTokens))
-        } else {
-            result.removeAll()
-        }
+        let actualTokensCount = -Int(nTokens)
+        result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
             model,
             token,
             &result,
             Int32(result.count)
         )
-        assert(check == nTokens)
+        assert(check == actualTokensCount)
     } else {
         result.removeLast(result.count - Int(nTokens))
     }
@@ -259,5 +256,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
         buffer = []
         return bufferString
     }
-    return nil
 }
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index aaef09611..09b36d9e6 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -164,13 +164,21 @@ actor LlamaContext {
     private func token_to_piece(token: llama_token) -> String {
         let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
         result.initialize(repeating: Int8(0), count: 8)
+        defer {
+            result.deallocate()
+        }
+        let nTokens = llama_token_to_piece(model, token, result, 7) // offer 7 of the 8 bytes so a trailing NUL always remains for String(cString:)
 
-        let _ = llama_token_to_piece(model, token, result, 8)
-
-        let resultStr = String(cString: result)
-
-        result.deallocate()
-
-        return resultStr
+        if nTokens < 0 {
+            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens) + 1) // +1: the C call writes no NUL terminator
+            newResult.initialize(repeating: Int8(0), count: Int(-nTokens) + 1)
+            defer {
+                newResult.deallocate()
+            }
+            _ = llama_token_to_piece(model, token, newResult, -nTokens)
+            return String(cString: newResult)
+        } else {
+            return String(cString: result)
+        }
     }
 }

From d5a1cbde60531d02ac74da27ea355182e3a4d516 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Dec 2023 20:35:03 +0200
Subject: [PATCH 46/64] llama : support optional tensors (#4283)

---
 examples/server/server.cpp |  2 +-
 llama.cpp                  | 33 +++++++++------------------------
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a65344b92..0fd42dcba 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1469,7 +1469,7 @@ struct llama_server_context
 
     int split_multiprompt_task(task_server& multiprompt_task)
     {
-        auto prompt_count = multiprompt_task.data.at("prompt").size();
+        int prompt_count = multiprompt_task.data.at("prompt").size();
         assert(prompt_count > 1);
 
         int multitask_id = id_gen++;
diff --git a/llama.cpp b/llama.cpp
index 15e52ad36..99964ec00 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1991,10 +1991,13 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool optional = false) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
+            if (optional) {
+                return NULL;
+            }
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }
 
@@ -2812,29 +2815,11 @@ static void llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
 
-                        try {
-                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
-                        }
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, true);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, true);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, true);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, true);
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 

From 5a7d3125e7c24f223659b7f0b7aa7736986e92c0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Dec 2023 20:39:12 +0200
Subject: [PATCH 47/64] llama : avoid using "optional" keyword (#4283)

---
 llama.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 99964ec00..3f5d663cf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1991,11 +1991,11 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool optional = false) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
-            if (optional) {
+            if (!required) {
                 return NULL;
             }
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
@@ -2816,10 +2816,10 @@ static void llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, true);
-                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, true);
-                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, true);
-                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, true);
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, false);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, false);
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 

From d7b800b8bc490a221acbd83c575206a907f2f6e2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 3 Dec 2023 10:58:16 +0200
Subject: [PATCH 48/64] llama : pad KV cache size (#4280)

* llama : pad KV cache size to 32

* metal : try to improve batched decoding
---
 ggml-metal.m | 2 +-
 llama.cpp    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 6cfacf64f..3343bc8a3 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute(
 
                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                             // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
+                            int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;
 
 #if 0
                             // the numbers below are measured on M2 Ultra for 7B and 13B models
diff --git a/llama.cpp b/llama.cpp
index 3f5d663cf..fd905ade7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 

From 6949b50df56ee58a2d76d45487942cb211c08629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rickard=20Ed=C3=A9n?= <rickardeden@gmail.com>
Date: Sun, 3 Dec 2023 10:03:25 +0100
Subject: [PATCH 49/64] py : add grammar to oai like api (#4294)

---
 examples/server/api_like_OAI.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index 830c056d4..607fe49d3 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
     if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
     if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
     if(is_present(body, "seed")): postData["seed"] = body["seed"]
+    if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
     if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
     if (args.stop != ""):
         postData["stop"] = [args.stop]

From 33e171d1e9fc4903f9314b490d77fb8d58331b63 Mon Sep 17 00:00:00 2001
From: Ed Lee <edilee@mozilla.com>
Date: Sun, 3 Dec 2023 01:10:43 -0800
Subject: [PATCH 50/64] server : fix OpenAI API `stop` field to be optional
 (#4299)

(cherry picked from commit Mozilla-Ocho/llamafile@e8c92bcb84ae3bcbf0d617b7ee6a5413bcbd58af)
---
 examples/server/server.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0fd42dcba..911f7bbe1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2410,9 +2410,7 @@ json oaicompat_completion_params_parse(
     }
 
     // Handle 'stop' field
-    if (body["stop"].is_null()) {
-        llama_params["stop"] = json::array({});
-    } else if (body["stop"].is_string()) {
+    if (body.contains("stop") && body["stop"].is_string()) {
         llama_params["stop"] = json::array({body["stop"].get<std::string>()});
     } else {
         llama_params["stop"] = json_value(body, "stop", json::array());

From adf3de4f69ff7e44131222f05f9c7447ac0be3cb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 3 Dec 2023 15:56:22 +0200
Subject: [PATCH 51/64] ggml : fix soft max out-of-bounds access (#4307)

ggml-ci
---
 ggml.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index e2687ef4f..cecb12700 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15629,7 +15629,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15645,6 +15644,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;

From fbbc42827b2949b95bcde23ce47bb47d006c895d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 3 Dec 2023 15:56:35 +0200
Subject: [PATCH 52/64] ggml : reuse ggml_get_n_tasks() in ggml_graph_plan()
 (#4308)

* ggml : fix soft max out-of-bounds access

ggml-ci

* ggml : reuse ggml_get_n_tasks() in ggml_graph_plan()

ggml-ci
---
 ggml.c | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/ggml.c b/ggml.c
index cecb12700..f743df1f3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15879,18 +15879,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -15898,16 +15896,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -15935,16 +15929,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15974,7 +15964,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_IM2COL:
                 {
-                    n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
@@ -15992,8 +15981,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16006,8 +15993,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16018,8 +16003,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t    D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16034,8 +16017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:

From 4fa44e84adb4c78e1885694cc3513982d4af2b08 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine <eltociear@gmail.com>
Date: Mon, 4 Dec 2023 16:57:35 +0900
Subject: [PATCH 53/64] grammar-parser : fix typo (#4318)

preceeding -> preceding
---
 common/grammar-parser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index ff51cc803..bf89a96f3 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -190,7 +190,7 @@ namespace grammar_parser {
                 pos = parse_space(pos + 1, is_nested);
             } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                 if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                 }
 
                 // apply transformation to previous symbol (last_sym_start to end) according to

From 5c9f90cba1cc6b0a2a7d19ee5dcb73cad6331d30 Mon Sep 17 00:00:00 2001
From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Date: Mon, 4 Dec 2023 22:43:45 +0900
Subject: [PATCH 54/64] swift : fix prompt tokenization logic (#4321)

---
 examples/batched.swift/Sources/main.swift             | 5 +++--
 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index ce9d80d9b..4d0005349 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 09b36d9e6..f828106fb 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -147,9 +147,10 @@ actor LlamaContext {
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
 
         var swiftTokens: [llama_token] = []
         for i in 0..<tokenCount {

From d208995c6da66f252d4054c1c5a90eb8ccb7a2f7 Mon Sep 17 00:00:00 2001
From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Date: Tue, 5 Dec 2023 01:03:49 +0900
Subject: [PATCH 55/64] swift : fix concatenation method to avoid invalid UTF8
 stringification (#4325)

---
 .../llama.cpp.swift/LibLlama.swift            | 37 +++++++++++++++----
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index f828106fb..3754f0551 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -11,6 +11,8 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]
 
     var n_len: Int32 = 512
     var n_cur: Int32 = 0
@@ -21,6 +23,7 @@ actor LlamaContext {
         self.context = context
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
     }
 
     deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
         print("attempting to complete \"\(text)\"")
 
         tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []
 
         let n_ctx = llama_n_ctx(context)
         let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -72,7 +76,7 @@ actor LlamaContext {
         }
 
         for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
         }
 
         // batch = llama_batch_init(512, 0) // done in init()
@@ -115,10 +119,25 @@ actor LlamaContext {
 
         if new_token_id == llama_token_eos(context) || n_cur == n_len {
             print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
         }
 
-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
         print(new_token_str)
         // tokens_list.append(new_token_id)
 
@@ -144,6 +163,7 @@ actor LlamaContext {
 
     func clear() {
         tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
@@ -162,7 +182,8 @@ actor LlamaContext {
         return swiftTokens
     }
 
-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
         let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
         result.initialize(repeating: Int8(0), count: 8)
         defer {
@@ -176,10 +197,12 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
         } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
         }
     }
 }

From 23b5e12eb5a76489b4c3ee22213a081da68b1809 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 4 Dec 2023 17:04:21 +0100
Subject: [PATCH 56/64] simple : update error message for KV cache check
 (#4324)

This commit updates the error message that is printed when the
KV cache is not big enough to hold all the prompt and generated
tokens. Specifically it removes the reference to n_parallel and
replaces it with n_len.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/simple/simple.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 374aef6f1..9cfde8308 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
         LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
         return 1;
     }
 

From e4b76bbe316ee50fb17d9ac29e654c0edf830eba Mon Sep 17 00:00:00 2001
From: kchro3 <62481661+kchro3@users.noreply.github.com>
Date: Mon, 4 Dec 2023 23:29:46 -0800
Subject: [PATCH 57/64] swift : revert compiler checks for swift package
 (#4332)

---
 Package.swift | 46 ++++++++++++++++------------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/Package.swift b/Package.swift
index 5b3bd72ca..18d610d69 100644
--- a/Package.swift
+++ b/Package.swift
@@ -2,33 +2,14 @@
 
 import PackageDescription
 
-#if arch(arm) || arch(arm64)
-let platforms: [SupportedPlatform]? = [
-    .macOS(.v12),
-    .iOS(.v14),
-    .watchOS(.v4),
-    .tvOS(.v14)
-]
-let exclude: [String] = []
-let resources: [Resource] = [
-    .process("ggml-metal.metal")
-]
-let additionalSources: [String] = ["ggml-metal.m"]
-let additionalSettings: [CSetting] = [
-    .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_USE_METAL")
-]
-#else
-let platforms: [SupportedPlatform]? = nil
-let exclude: [String] = ["ggml-metal.metal"]
-let resources: [Resource] = []
-let additionalSources: [String] = []
-let additionalSettings: [CSetting] = []
-#endif
-
 let package = Package(
     name: "llama",
-    platforms: platforms,
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
@@ -36,25 +17,30 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude: exclude,
+            exclude: [],
             sources: [
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
                 "ggml-quants.c",
-            ] + additionalSources,
-            resources: resources,
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14
                 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                 // .define("ACCELERATE_NEW_LAPACK"),
                 // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]

From 52c8bc3cf312e1caf02d37bfb9d9d865cbe33594 Mon Sep 17 00:00:00 2001
From: MaggotHATE <clay1326@gmail.com>
Date: Tue, 5 Dec 2023 15:05:51 +0500
Subject: [PATCH 58/64] sampling : custom samplers order (#4285)

* Samplers sequence order w parameter

* Cleaned commented code

* Fixed formatting

* Rewrote with unordered_map

* Revert and rewrite, too many problems and safeguards would be needed

* Fixed code style

* Code style fixes according to review

* More readable samplers input string, fixed help

* Style fix in sampler_queue

* Formatting fixes

* Fixing whitespaces
---
 common/common.cpp      | 56 +++++++++++++++++++++++++++++++++++++++
 common/common.h        |  6 +++++
 common/sampling.cpp    | 60 ++++++++++++++++++++++++++++++++++--------
 common/sampling.h      | 36 ++++++++++++++-----------
 examples/main/main.cpp |  1 +
 5 files changed, 132 insertions(+), 27 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1dcc235ea..8e6d74d0d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -280,6 +280,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.yarn_beta_slow = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
+        } else if (arg == "--samplers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+        } else if (arg == "--sampling-seq") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = argv[i];
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -761,6 +773,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
     printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -886,6 +900,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     GGML_UNREACHABLE();
 }
 
+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input) {
+    // sampler names are written multiple ways, so both the system names and
+    // the input names are accepted and mapped to the same sequence symbol
+    const std::unordered_map<std::string, char> name_to_symbol {
+        {"top_k",      'k'},
+        {"top-k",      'k'},
+        {"top_p",      'p'},
+        {"top-p",      'p'},
+        {"nucleus",    'p'},
+        {"typical_p",  'y'},
+        {"typical-p",  'y'},
+        {"typical",    'y'},
+        {"min_p",      'm'},
+        {"min-p",      'm'},
+        {"tfs_z",      'f'},
+        {"tfs-z",      'f'},
+        {"tfs",        'f'},
+        {"temp",       't'},
+        {"temperature",'t'}
+    };
+
+    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
+    std::string sequence;
+    size_t token_begin = 0;
+    size_t token_end   = 0;
+    do {
+        token_end = input.find(';', token_begin);
+        // for the last token the count exceeds the remaining length and substr() clamps it
+        const auto entry = name_to_symbol.find(input.substr(token_begin, token_end - token_begin));
+        if (entry != name_to_symbol.end()) { // names that are not in the table are skipped
+            sequence += entry->second;
+        }
+        // wraps around to 0 after the last token, where the loop condition stops the loop
+        token_begin = token_end + 1;
+    } while (token_end != std::string::npos);
+    return sequence;
+}
+
 //
 // Model utils
 //
diff --git a/common/common.h b/common/common.h
index 2f6fe48ab..534f7b132 100644
--- a/common/common.h
+++ b/common/common.h
@@ -141,6 +141,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 
 void process_escapes(std::string& input);
 
+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input);
+
 //
 // Model utils
 //
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 1317024c2..b6bb886c6 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -99,6 +99,54 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
     return std::string(result);
 }
 
+std::string llama_sampling_order_print(const llama_sampling_params & params) {
+    std::string order = "CFG -> Penalties "; // fixed prefix of the printed order
+    if (params.mirostat != 0) {
+        return order + "-> mirostat "; // samplers_sequence is not printed in this case
+    }
+    for (const char symbol : params.samplers_sequence) {
+        switch (symbol) {
+            case 'k': order += "-> top_k "; break;
+            case 'f': order += "-> tfs_z "; break;
+            case 'y': order += "-> typical_p "; break;
+            case 'p': order += "-> top_p "; break;
+            case 'm': order += "-> min_p "; break;
+            case 't': order += "-> temp "; break;
+            default : break; // symbols without a case above print nothing
+        }
+    }
+    return order;
+}
+
+// runs the samplers on cur_p in the order given by params.samplers_sequence; file-local, not declared in the header
+static void sampler_queue(
+                   struct llama_context * ctx_main,
+            const llama_sampling_params & params,
+                 llama_token_data_array & cur_p,
+                                 size_t & min_keep) {
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+    const float         temp              = params.temp;
+    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k; // <= 0 selects the vocab size
+    const float         top_p             = params.top_p;
+    const float         min_p             = params.min_p;
+    const float         tfs_z             = params.tfs_z;
+    const float         typical_p         = params.typical_p;
+    const std::string & samplers_sequence = params.samplers_sequence;
+    // one character per sampler call; characters without a matching case are ignored
+    for (auto s : samplers_sequence) {
+        switch (s) {
+            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            default : break;
+        }
+    }
+}
+
 llama_token llama_sampling_sample(
                   struct llama_sampling_context * ctx_sampling,
                   struct llama_context * ctx_main,
@@ -109,11 +157,6 @@ llama_token llama_sampling_sample(
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
 
     const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   min_p           = params.min_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
     const float   penalty_repeat  = params.penalty_repeat;
     const float   penalty_freq    = params.penalty_freq;
@@ -188,12 +231,7 @@ llama_token llama_sampling_sample(
             // temperature sampling
             size_t min_keep = std::max(1, params.n_probs);
 
-            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
-            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
-            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
-            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
-            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
-            llama_sample_temp     (ctx_main, &cur_p, temp);
+            sampler_queue(ctx_main, params, cur_p, min_keep);
 
             id = llama_sample_token(ctx_main, &cur_p);
 
diff --git a/common/sampling.h b/common/sampling.h
index 7c9b8dcf2..fdfa9eed1 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -10,22 +10,23 @@
 
 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;       // number of previous tokens to remember
+    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     top_k                 = 40;       // <= 0 to use vocab size
+    float       top_p                 = 0.95f;    // 1.0 = disabled
+    float       min_p                 = 0.05f;    // 0.0 = disabled
+    float       tfs_z                 = 1.00f;    // 1.0 = disabled
+    float       typical_p             = 1.00f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // 1.0 = disabled
+    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
+    float       penalty_freq          = 0.00f;    // 0.0 = disabled
+    float       penalty_present       = 0.00f;    // 0.0 = disabled
+    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;    // target entropy
+    float       mirostat_eta          = 0.10f;    // learning rate
+    bool        penalize_nl           = true;     // consider newlines as a repeatable token
+    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
 
     std::string grammar;  // optional BNF-like grammar to constrain sampling
 
@@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);
 
+// Print sampling order into a string
+std::string llama_sampling_order_print(const llama_sampling_params & params);
+
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c5cdfbf21..c096f110b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -437,6 +437,7 @@ int main(int argc, char ** argv) {
         }
     }
     LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 

From 5aa365d88fdb8fdd430ef3fc141c7a5fd37c3502 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Tue, 5 Dec 2023 10:19:18 -0700
Subject: [PATCH 59/64] llama : allow overriding GGUF metadata when loading
 model (#4092)

* feat: Allow overriding GGUF metadata when loading model

* Fix the one time GCC is stricter than clang about something

* Step1

* Refactor... basically everything!

* Nuke obsolete GetArrayLen struct

* simplify std::string specialization

* Various cleanups

Add informational output when overrides are applied

Warn user when an override with the wrong type is specified

* Fix broken logic for parsing bool KV overrides
Fix issue where overrides didn't apply when key missing in GGUF metadata
Resolve merge changes

* llama : rearrange model params

* Update new GET_KEY call

Add note that metadata KV overrides aren't reflected in initial metadata KV info dump

---------

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/common.cpp |  55 +++++++
 common/common.h   |   2 +
 llama.cpp         | 370 +++++++++++++++++++++++++++++++++++-----------
 llama.h           |  20 +++
 4 files changed, 361 insertions(+), 86 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8e6d74d0d..4e823c526 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -690,6 +690,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 std::istreambuf_iterator<char>(),
                 std::back_inserter(sparams.grammar)
             );
+        } else if (arg == "--override-kv") { // expects KEY=TYPE:VALUE with TYPE one of int, float, bool
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            char * sep = strchr(argv[i], '=');
+            if (sep == nullptr || sep - argv[i] >= 128) { // key plus its NUL must fit in llama_model_kv_override::key (char[128])
+                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            struct llama_model_kv_override kvo;
+            std::strncpy(kvo.key, argv[i], sep - argv[i]);
+            kvo.key[sep - argv[i]] = 0; // strncpy does not terminate when the source is longer than the count
+            sep++; // step past '=' to the TYPE:VALUE part
+            if (strncmp(sep, "int:", 4) == 0) {
+                sep += 4;
+                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.int_value = std::atoll(sep); // int_value is int64_t; atol would cap the value where long is 32-bit
+            } else if (strncmp(sep, "float:", 6) == 0) {
+                sep += 6;
+                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.float_value = std::atof(sep);
+            } else if (strncmp(sep, "bool:", 5) == 0) {
+                sep += 5;
+                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                if (std::strcmp(sep, "true") == 0) {
+                    kvo.bool_value = true;
+                } else if (std::strcmp(sep, "false") == 0) {
+                    kvo.bool_value = false;
+                } else {
+                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                    invalid_param = true;
+                    break;
+                }
+            } else {
+                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
         // Parse args for logging parameters
         } else if ( log_param_single_parse( argv[i] ) ) {
@@ -733,6 +774,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.back().key[0] = 0;
+    }
+
     return true;
 }
 
@@ -864,6 +910,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
     printf("  -ld LOGDIR, --logdir LOGDIR\n");
     printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
+    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("\n");
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
@@ -956,6 +1005,12 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }
 
     return mparams;
 }
diff --git a/common/common.h b/common/common.h
index 534f7b132..024679380 100644
--- a/common/common.h
+++ b/common/common.h
@@ -86,6 +86,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir            = "";  // directory in which to save YAML log files
 
+    std::vector<llama_model_kv_override> kv_overrides;
+
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base  = "";                              // base model path for the lora adapter
diff --git a/llama.cpp b/llama.cpp
index fd905ade7..b77020e10 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -74,6 +74,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 
 #if defined(_MSC_VER)
@@ -590,21 +591,6 @@ struct LLM_TN {
 // gguf helpers
 //
 
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-    } \
-} while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -638,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
     }
 }
 
-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
 
     switch (type) {
@@ -1797,6 +1783,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     return buf;
 }
 
+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo{
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV: public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { // true only when an override is present and its tag equals expected_type
+            if (!override) { return false; }
+            if (override->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(override->tag), override->key);
+                switch (override->tag) { // finish the line through the same logger so prefix and value share one sink
+                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                        LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_INT:   {
+                        LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                        LLAMA_LOG_INFO("%.6f\n", override->float_value);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(override->tag), override->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); // mismatched tag: warn and report the override as unusable
+            return false;
+        }
+
+        template<typename OT> // bool overload: applies a LLAMA_KV_OVERRIDE_BOOL override to target
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+                target = override->bool_value;
+                return true;
+            }
+            return false; // no usable override: report it so set() falls back to reading the key from the GGUF context
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+                target = override->int_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+                target = override->float_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            (void)target;
+            (void)override;
+            if (!override) { return false; }
+            // Currently, we should never end up here so it would be a bug if we do.
+            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
+                override ? override->key : "NULL"));
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { // fills target from the override if try_override accepts it, otherwise from key index k in ctx
+            if (try_override<T>(target, override)) {
+                return true; // try_override reported success; ctx is not consulted
+            }
+            if (k < 0) { return false; } // negative index: nothing to read, target is left as it was
+            target = get_kv(ctx, k); // get_kv throws std::runtime_error when the stored type differs from GKV::gt
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, override);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, key.c_str(), target, override);
+        }
+    };
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -1812,21 +1961,34 @@ struct llama_model_loader {
     llama_fver  fver;
 
     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;
 
-    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+    std::string arch_name;
+    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };
 
+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
@@ -1894,6 +2056,7 @@ struct llama_model_loader {
                 }
             }
 
+            LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
             for (int i = 0; i < n_kv; i++) {
                 const char * name           = gguf_get_key(ctx_gguf, i);
                 const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
@@ -1939,19 +2102,59 @@ struct llama_model_loader {
         }
     }
 
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = true) {
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+
+
+        result = arr_info.length;
+        return true;
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
+
+    template<typename T> // reads metadata `key` into result, giving a matching entry in kv_overrides the first chance to supply the value
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr; // nullptr when no override was registered for this key
+
+        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+
+        if (required && !found) { // optional keys simply return false and leave result as the caller initialised it
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
+    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_key(llm_kv(kid), result, required);
+    }
+
     std::string get_arch_name() const {
-        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-        std::string arch_name;
-        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
-
         return arch_name;
     }
 
     enum llm_arch get_arch() const {
-        const std::string arch_name = get_arch_name();
-
-        return llm_arch_from_string(arch_name);
+        return llm_kv.arch;
     }
 
     const char * get_tensor_name(int i) const {
@@ -2201,11 +2404,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
     auto & hparams = model.hparams;
+    const gguf_context * ctx = ml.ctx_gguf;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2219,42 +2419,41 @@ static void llm_load_hparams(
     }
 
     // get general kv
-    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
+    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,        gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_embd,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_ff,           gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_head,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams.n_layer,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
+    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
-    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
 
-    hparams.rope_finetuned = false;
-    GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
-                 kv(LLM_KV_ROPE_SCALING_FINETUNED));
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
 
     hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
-                 kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
 
     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
-    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
 
     std::string rope_scaling("linear");
-    GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
-    if (ropescale == 0.0f) { // try the old key name
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
@@ -2262,7 +2461,7 @@ static void llm_load_hparams(
     {
         hparams.n_rot = hparams.n_embd / hparams.n_head;
 
-        GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
         if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2277,7 +2476,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2291,7 +2490,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2301,7 +2500,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2310,7 +2509,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2321,7 +2520,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2329,7 +2528,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_REFACT:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2337,7 +2536,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2352,9 +2551,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;
 
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
-                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2364,7 +2563,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -2373,7 +2572,8 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2421,7 +2621,7 @@ static void llm_load_vocab(
     {
         std::string tokenizer_name;
 
-        GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2511,34 +2711,31 @@ static void llm_load_vocab(
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it), old_id = id;
+            int32_t & id = std::get<1>(it);
 
-            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
-            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
-            // can only come from the default value, so there's no point in
-            // validating that.
-            if (size_t(id + 1) > vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
-                    __func__, key.c_str(), id, old_id);
-                id = old_id;
+            uint32_t new_id; // only read after get_key reports that it stored a value
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= vocab.id_to_token.size()) { // id outside the vocabulary: keep the existing default
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
+            }
 
         }
 
         // Handle add_bos_token and add_eos_token
-        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
-        int kid = gguf_find_key(ctx, key.c_str());
-        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
-        }
-        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
-        kid = gguf_find_key(ctx, key.c_str());
-        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                vocab.special_add_bos = int(temp);
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                vocab.special_add_eos = int(temp);
+            }
         }
     }
 
@@ -3487,7 +3684,7 @@ static void llm_load_tensors(
 
 static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -8078,7 +8275,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap);
+    llama_model_loader ml(fname_inp, use_mmap, NULL);
     if (ml.use_mmap) {
         ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
     }
@@ -8374,7 +8571,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -8602,6 +8799,7 @@ struct llama_model_params llama_model_default_params() {
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
diff --git a/llama.h b/llama.h
index 89cb6198e..517245a35 100644
--- a/llama.h
+++ b/llama.h
@@ -158,6 +158,22 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
+    enum llama_model_kv_override_type { // type of the `tag` field in struct llama_model_kv_override
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override { // element type of llama_model_params::kv_overrides
+        char key[128]; // fixed-size 128-byte buffer for the key
+        enum llama_model_kv_override_type tag; // NOTE(review): presumably selects which union member below is valid - confirm against the loader
+        union { // anonymous union: the three value fields share storage
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
+
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible

From 5f6e0c0dff1e7a89331e6b25eca9a9fd71324069 Mon Sep 17 00:00:00 2001
From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Date: Tue, 5 Dec 2023 10:55:12 -1000
Subject: [PATCH 60/64] grammar : pre-computed pieces + reserve mem + less
 string copies (#4330)

* reserve space for codepoints

* improvement for the appended 0

* used precomputed token text for grammar sample

* reserve candidates_decoded

* reserve candidates_grammar

* remove candidates_decoded

* Revert "remove candidates_decoded"

This reverts commit 3773328080e6a139ee83198329a13cf4ff61d707.

* changed decode_utf8 to take src by ref
---
 llama.cpp | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b77020e10..14e5d312e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6851,14 +6851,13 @@ struct llama_grammar_candidate {
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const char         * src,
-        size_t               n_src,
+        const std::string & src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src;
+    const char          * pos      = src.c_str();
     std::vector<uint32_t> code_points;
     // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(n_src + 1);
+    code_points.reserve(src.size() + 1);
     uint32_t              value    = partial_start.value;
     int                   n_remain = partial_start.n_remain;
 
@@ -6909,13 +6908,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
-static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        std::string src,
-        llama_partial_utf8 partial_start
-) {
-    return decode_utf8(src.c_str(), src.size(), partial_start);
-}
-
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7554,11 +7546,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     const llama_token eos = llama_token_eos(&ctx->model);
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate>                              candidates_grammar;
+    candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const std::string & piece = ctx->model.vocab.id_to_token[id].text;
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7770,7 +7764,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string & piece = ctx->model.vocab.id_to_token[token].text;
 
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);

From da5eaef1f34d0a1f584cd4a092e7691ea46a9d91 Mon Sep 17 00:00:00 2001
From: stduhpf <stephduh@live.fr>
Date: Wed, 6 Dec 2023 09:08:17 +0100
Subject: [PATCH 61/64] speculative : support `--color` (#4343)

* speculative: add some colors

* minor : add braces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/speculative/speculative.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index ace755c51..dca3f84a5 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -203,8 +203,9 @@ int main(int argc, char ** argv) {
 
             const std::string token_str = llama_token_to_piece(ctx_tgt, id);
 
-            printf("%s", token_str.c_str());
-            fflush(stdout);
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }
 
             if (id == llama_token_eos(model_tgt)) {
                 has_eos = true;
@@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
                     ++n_past_tgt;
                     ++n_past_dft;
                     ++i_dft;
-
+                    if (params.use_color) {
+                        // Color token according to its origin sequence
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        fflush(stdout);
+                    }
                     continue;
                 }
             }
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);
 
             LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
 

From caa9249217c5fd524b900add5ddcbeaa20cbcb12 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Dec 2023 10:41:03 +0200
Subject: [PATCH 62/64] common : fix compile warning

---
 common/sampling.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index b6bb886c6..f4e76df31 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -113,13 +113,15 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
                 default : break;
             }
         }
-    } else result += "-> mirostat ";
+    } else {
+        result += "-> mirostat ";
+    }
 
     return result;
 }
 
 // no reasons to expose this function in header
-void sampler_queue(
+static void sampler_queue(
                    struct llama_context * ctx_main,
             const llama_sampling_params & params,
                  llama_token_data_array & cur_p,

From 05cd6e5036d72d0930de4d8f6be7bce09e8dda24 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Dec 2023 20:21:59 +0200
Subject: [PATCH 63/64] server : recognize cache_prompt parameter in OAI API
 (#4347)

---
 examples/server/server.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 911f7bbe1..369f81a84 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2387,6 +2387,7 @@ json oaicompat_completion_params_parse(
 
     // Map OpenAI parameters to llama.cpp parameters
     llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
     llama_params["temperature"]       = json_value(body, "temperature", 0.8);
     llama_params["top_k"]             = json_value(body, "top_k", 40);
     llama_params["top_p"]             = json_value(body, "top_p", 0.95);

From 81bc9214a389362010f7a57f4cbc30e5f83a2d28 Mon Sep 17 00:00:00 2001
From: Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
Date: Thu, 7 Dec 2023 02:25:22 -0800
Subject: [PATCH 64/64] train : fix #4227 (double free in
 examples/train-text-from-scratch/train-text-from-scratch.cpp) (#4351)

On commit b1108 (44c117f4) xaedes added

    ggml_allocr * alloc = NULL;

    ... (many lines in between)

    if (alloc) {
        ggml_allocr_free(alloc);
    }

Which is correct, but it's easy to lose context after many lines in between.

On commit b1287 (0e76a899) xaedes made a big change. From here on, alloc is freed eagerly.

    alloc = ggml_allocr_new(...)
    ... (short lines of code)
    ggml_allocr_free(alloc)

This happens a few times, but alloc is never set to NULL, and many lines below,
we still have

    if (alloc) {
        ggml_allocr_free(alloc);
    }

which causes a double-free.
---
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index f049a3923..f7ed63365 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
         opt_cb_data.last_save_iter = opt->iter;
     }
 
-    if (alloc) {
-        ggml_allocr_free(alloc);
-    }
-
     ggml_free(opt->ctx);
     free_train_state(train);
     ggml_free(model.ctx);