From 9a4b79bcfa4338b922fa8cf903bd5ac058aaf46f Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Tue, 26 Nov 2024 18:08:37 +0800
Subject: [PATCH 01/43] CANN: Improve the Inferencing Performance for Ascend
 NPU Device (#10454)

* improve inferencing performance for ascend npu.

Co-authored-by: Frank Mai <thxCode@thxcode0824@gmail.com>

* some modification after review

* some modifications after review

* restore some modifications

* restore some modifications

---------

Co-authored-by: shanshan shen <shanshanshen333@gmail.com>
Co-authored-by: Frank Mai <thxCode@thxcode0824@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 301 +++++++++++++++++++++++--------
 ggml/src/ggml-cann/common.h      |   9 +-
 ggml/src/ggml-cann/ggml-cann.cpp |  58 ++++--
 3 files changed, 266 insertions(+), 102 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 6113b59f4..d7472ee3a 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -33,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -2423,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2441,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  * tensors using the CANN backend.
@@ -2462,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
 
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+    case 2:
+        aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    case 3:
+        aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    default:
+        aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2501,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
     float weight_elem_size;
     if (type == GGML_TYPE_Q4_0) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
         weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
         GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
     }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
-    void* input_buffer;
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size,  input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2550,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
     }
 
     // output
     size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
     uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
-
     for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
         for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
             int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
             int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
             aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
-
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                 workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
             ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
             ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
             ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         }
     }
 
     // cast out
-    int64_t* output_cast_ne = dst->ne;
-    size_t output_cast_nb[GGML_MAX_DIMS];
-    output_cast_nb[0] = sizeof(uint16_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }
+
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
     }
-
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
-
-    ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
 }
 
 void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index edfa49614..5164cb74e 100644
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {}
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 2ef5b590a..c7a3419c7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }
 
     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
-    static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35;  // 32 GB
+    size_t max_size;
 
     /**
      * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     explicit ggml_cann_pool_vmm(int device)
         : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {}
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }
 
     /**
      * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // round up the allocation size to the alignment to ensure that all
         // allocations are aligned for all data types
         const size_t alignment = 128;
-        size = alignment * ((size + alignment - 1) / alignment);
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 
         size_t avail = pool_size - pool_used;
 
         if (size > avail) {
             // round up to the next multiple of the granularity
             size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_size);
 
             // allocate more physical memory
             aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // reserve virtual address space (if not already reserved)
             if (pool_addr == 0) {
                 ACL_CHECK(aclrtReserveMemAddress(
-                    &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
+                    &pool_addr, max_size, 0, NULL, 1));
             }
 
             // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // add to the pool
             pool_size += reserve_size;
 
-            // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
-            // reserved %llu MB)\n",
-            //       device, (unsigned long long) (pool_size/1024/1024),
-            //       (unsigned long long) (reserve_size/1024/1024));
+#ifdef DEBUG_CANN_MALLOC
+             GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                   device, (unsigned long long) (pool_size/1024/1024),
+                   (unsigned long long) (reserve_size/1024/1024));
+#endif
         }
 
         GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 }
 
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;
 
     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_cann_buffer_type_interface,
-                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context  = */
                  new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }
 
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
-
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;

From 811872a59daefb25fc0c4326bcb6d8ae893c2f7c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 12:29:38 +0200
Subject: [PATCH 02/43] speculative : simplify the implementation (#10504)

ggml-ci
---
 .../speculative-simple/speculative-simple.cpp | 57 ++++++++-----------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 7bf9056bf..2ea49d47c 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
     llama_token id_last = inp.back();
 
     // all tokens currently in the target context
-    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
+    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
     int n_past = inp.size() - 1;
 
@@ -181,54 +182,44 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
 
         n_past    += ids.size() - 1;
-        n_drafted += batch_tgt.n_tokens - 1;
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
         n_accept  += ids.size() - 1;
+        n_predict += ids.size();
 
         // process the accepted tokens and update contexts
         //
         // this is the standard token post-processing that we normally do
         // in this case, we do it for a group of accepted tokens at once
         //
-        {
-            llama_token id;
-            std::string token_str;
+        for (size_t i = 0; i < ids.size(); ++i) {
+            prompt_tgt.push_back(id_last);
 
-            for (size_t i = 0; i < ids.size(); ++i) {
-                id = ids[i];
+            id_last = ids[i];
 
-                ++n_predict;
-
-                if (llama_token_is_eog(model_tgt, id)) {
-                    has_eos = true;
-                    break;
-                }
-
-                token_str = common_token_to_piece(ctx_tgt, id);
-
-                if (params.use_color && i + 1 < ids.size()) {
-                    LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
-                } else {
-                    LOG("%s", token_str.c_str());
-                }
-            }
-
-            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            if (llama_token_is_eog(model_tgt, id_last)) {
+                has_eos = true;
                 break;
             }
 
-            LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
 
-            {
-                LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
-
-                llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            if (params.use_color && i + 1 < ids.size()) {
+                LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
+            } else {
+                LOG("%s", token_str.c_str());
             }
+        }
 
-            prompt_tgt.push_back(id_last);
-            prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
+        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
 
-            // remember the last accepted token for the next iteration
-            id_last = id;
+        {
+            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
+
+            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+        }
+
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            break;
         }
     }
 

From 84e1c33cde9e0a7aafcda2d4f21ba51c300482d7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 13:36:40 +0200
Subject: [PATCH 03/43] server : fix parallel speculative decoding (#10513)

ggml-ci
---
 examples/server/server.cpp | 71 +++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c0ea4faf7..9c86407c2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2267,49 +2267,48 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id;
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-                {
-                    completion_token_output result;
+                slot.i_batch = -1;
 
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                common_sampler_accept(slot.smpl, id, true);
 
-                    slot.i_batch = -1;
-
-                    common_sampler_accept(slot.smpl, id, true);
-
-                    slot.n_decoded += 1;
-                    if (slot.n_decoded == 1) {
-                        slot.t_start_generation = ggml_time_us();
-                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                        metrics.on_prompt_eval(slot);
-                    }
-
-                    result.tok = id;
-
-                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
-
-                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
-                        result.probs.push_back({
-                            cur_p->data[i].id,
-                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                        });
-                    }
-
-                    if (!process_token(result, slot)) {
-                        // release slot because of stop condition
-                        slot.release();
-                        slot.print_timings();
-                        send_final_response(slot);
-                        metrics.on_prediction(slot);
-                        continue;
-                    }
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
                 }
 
-                // check if the slot supports speculative decoding
-                if (!slot.can_speculate()) {
+                completion_token_output result;
+                result.tok = id;
+
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                    result.probs.push_back({
+                        cur_p->data[i].id,
+                            i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    });
+                }
+
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
                     continue;
                 }
+            }
+
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
+                    continue;
+                }
+
+                llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;
                 params_spec.n_draft   = slot.params.speculative.n_max;

From 25669aa92caaddff09f39b54a5173e5cb2680fa3 Mon Sep 17 00:00:00 2001
From: Charles Xu <charles.xu@arm.com>
Date: Tue, 26 Nov 2024 12:37:05 +0100
Subject: [PATCH 04/43] ggml-cpu: cmake add arm64 cpu feature check for macos
 (#10487)

* ggml-cpu: cmake add arm64 cpu feature check for macos

* use vmmlaq_s32 for compile option i8mm check
---
 ggml/src/ggml-cpu/CMakeLists.txt | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index c2905d1fb..ddc05ecef 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    elseif (APPLE)
+        if (GGML_NATIVE)
+            set(USER_PROVIDED_MARCH FALSE)
+            foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
+                if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
+                    set(USER_PROVIDED_MARCH TRUE)
+                    break()
+                endif()
+            endforeach()
+
+            if (NOT USER_PROVIDED_MARCH)
+                set(MARCH_FLAGS "-march=armv8.2a")
+
+                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+                if (GGML_COMPILER_SUPPORT_DOTPROD)
+                    set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+                    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                endif ()
+
+                set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+
+                set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                    set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+                    add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                endif ()
+                set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+                list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+            endif ()
+        endif ()
     else()
         check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
         if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

From c6807b3f28cc3dbfda3ec390bcb87e69fb5634e2 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 13:05:07 +0100
Subject: [PATCH 05/43] ci : add ubuntu cuda build, build with one arch on
 windows (#10456)

---
 .github/labeler.yml               | 15 +++-----
 .github/workflows/build.yml       | 59 ++++++++++++++++++++++++++++++-
 .github/workflows/nix-ci.yml      |  2 ++
 .github/workflows/python-lint.yml |  9 ++++-
 4 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 89436740d..1b47bc968 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -3,19 +3,18 @@ Kompute:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute.cpp
+            - ggml/src/ggml-kompute/**
             - README-kompute.md
 Apple Metal:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal.cpp
+            - ggml/src/ggml-metal/**
             - README-metal.md
 SYCL:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl.cpp
             - ggml/src/ggml-sycl/**
             - docs/backend/SYCL.md
             - examples/sycl/**
@@ -27,8 +26,8 @@ Nvidia GPU:
 Vulkan:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml/ggml_vk_generate_shaders.py
-            - ggml/src/ggml-vulkan*
+            - ggml/include/ggml-vulkan.h
+            - ggml/src/ggml-vulkan/**
 documentation:
     - changed-files:
         - any-glob-to-any-file:
@@ -75,11 +74,7 @@ server:
 ggml:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml/include/ggml*.h
-            - ggml/src/ggml*.c
-            - ggml/src/ggml*.cpp
-            - ggml/src/ggml*.h
-            - ggml-cuda/**
+            - ggml/**
 nix:
     - changed-files:
         - any-glob-to-any-file:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index abaf2c504..6281663ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -871,8 +871,65 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
           name: llama-bin-win-${{ matrix.build }}.zip
 
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v4
+
+        - name: Install dependencies
+          env:
+            DEBIAN_FRONTEND: noninteractive
+          run: |
+              apt update
+              apt install -y cmake build-essential ninja-build libgomp1 git
+
+        - name: Build with CMake
+          run: |
+            cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+            cmake --build build
+
   windows-latest-cmake-cuda:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.6.2']
+        build: ['cuda']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install CUDA toolkit
+        id: cuda-toolkit
+        uses: Jimver/cuda-toolkit@v0.2.19
+        with:
+          cuda: ${{ matrix.cuda }}
+          method: 'network'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
+          cmake --build build --config Release -t ggml-cuda
+          cmake --build build --config Release
+
+  windows-2019-cmake-cuda:
     runs-on: windows-2019
+    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:
@@ -1173,7 +1230,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-2019-cmake-cuda
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 8ecbbe53b..3fe941576 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,8 +5,10 @@ on:
   push:
     branches:
       - master
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
   pull_request:
     types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index a8d46f31d..ddfdf73b8 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,6 +1,13 @@
 name: flake8 Lint
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

From 7db3846a94ce7683b3e8120abe427457edf840c9 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 13:05:20 +0100
Subject: [PATCH 06/43] ci : publish the docker images created during scheduled
 runs (#10515)

---
 .github/workflows/docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 9cef283d9..bc2e5020d 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -114,7 +114,7 @@ jobs:
           swap-storage: true
 
       - name: Build and push Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: .

From ab96610b1e58684bc5e8b810130c4cf6d8252e21 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 14:18:08 +0200
Subject: [PATCH 07/43] cmake : enable warnings in llama (#10474)

* cmake : enable warnings in llama

ggml-ci

* cmake : add llama_get_flags and respect LLAMA_FATAL_WARNINGS

* cmake : get_flags -> ggml_get_flags

* speculative-simple : fix warnings

* cmake : reuse ggml_get_flags

ggml-ci

* speculative-simple : fix compile warning

ggml-ci
---
 CMakeLists.txt                                |  1 +
 cmake/common.cmake                            | 33 +++++++++++++++++++
 common/CMakeLists.txt                         |  2 ++
 examples/CMakeLists.txt                       |  4 +++
 .../speculative-simple/speculative-simple.cpp |  6 ++--
 ggml/src/CMakeLists.txt                       |  5 +--
 ggml/src/ggml-cuda/CMakeLists.txt             |  2 +-
 src/CMakeLists.txt                            |  2 ++
 8 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 cmake/common.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7d91a5b5..0d389dccb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
 # override ggml options
 set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
diff --git a/cmake/common.cmake b/cmake/common.cmake
new file mode 100644
index 000000000..0f54871e4
--- /dev/null
+++ b/cmake/common.cmake
@@ -0,0 +1,33 @@
+function(llama_add_compile_flags)
+    if (LLAMA_FATAL_WARNINGS)
+        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+            list(APPEND C_FLAGS   -Werror)
+            list(APPEND CXX_FLAGS -Werror)
+        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+            add_compile_options(/WX)
+        endif()
+    endif()
+
+    if (LLAMA_ALL_WARNINGS)
+        if (NOT MSVC)
+            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                -Werror=implicit-int -Werror=implicit-function-declaration)
+
+            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+
+            list(APPEND C_FLAGS   ${WARNING_FLAGS})
+            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+        else()
+            # todo : msvc
+            set(C_FLAGS   "" PARENT_SCOPE)
+            set(CXX_FLAGS "" PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 62a8a7db5..223174884 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -2,6 +2,8 @@
 
 find_package(Threads REQUIRED)
 
+llama_add_compile_flags()
+
 # Build info header
 #
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 21db1f3c2..9210e9fea 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,6 +6,10 @@ find_package(Threads REQUIRED)
 
 # ...
 
+# flags
+
+llama_add_compile_flags()
+
 # examples
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 2ea49d47c..8ca84f7af 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -70,13 +70,13 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     inp = common_tokenize(ctx_tgt, params.prompt, true, true);
 
-    if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
+    if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
 
         return 1;
     }
 
-    if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
+    if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
 
         return 1;
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
         // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
         {
             // do not waste time on small drafts
-            if (draft.size() < n_draft_min) {
+            if (draft.size() < (size_t) n_draft_min) {
                 draft.clear();
             }
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 071508dda..9022aa3ae 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -24,7 +24,7 @@ if (NOT MSVC)
     endif()
 endif()
 
-function(get_flags CCID CCVER)
+function(ggml_get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
 
@@ -41,6 +41,7 @@ function(get_flags CCID CCVER)
     elseif (CCID STREQUAL "GNU")
         set(C_FLAGS   -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
+
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
@@ -69,7 +70,7 @@ if (GGML_ALL_WARNINGS)
         list(APPEND C_FLAGS   ${WARNING_FLAGS})
         list(APPEND CXX_FLAGS ${WARNING_FLAGS})
 
-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+        ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
 
         add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                             "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index b0cb93e07..14761650f 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -132,7 +132,7 @@ if (CUDAToolkit_FOUND)
 
         message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
 
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
         list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
     endif()
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a86624750..2f581b921 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,8 @@ if (WIN32)
     endif()
 endif()
 
+llama_add_compile_flags()
+
 #
 # libraries
 #

From 0bbd2262a3263f37385297b30de37941836e57f7 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Tue, 26 Nov 2024 21:43:47 +0800
Subject: [PATCH 08/43] restore the condistion to build & update pacakge when
 merge (#10507)

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6281663ec..c6aecec6e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1041,7 +1041,7 @@ jobs:
 
       - name: Build the release package
         id: pack_artifacts
-        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
           echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
 
@@ -1066,7 +1066,7 @@ jobs:
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
 
       - name: Upload the release package
-        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip

From 45abe0f74ee281aea6e5283c1e738061256cfcae Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 26 Nov 2024 16:20:18 +0100
Subject: [PATCH 09/43] server : replace behave with pytest (#10416)

* server : replace behave with pytest

* fix test on windows

* misc

* add more tests

* more tests

* styling

* log less, fix embd test

* added all sequential tests

* fix coding style

* fix save slot test

* add parallel completion test

* fix parallel test

* remove feature files

* update test docs

* no cache_prompt for some tests

* add test_cache_vs_nocache_prompt
---
 .devops/nix/python-scripts.nix                |    2 +-
 .github/workflows/server.yml                  |    9 +-
 examples/server/tests/.gitignore              |    1 +
 examples/server/tests/README.md               |   33 +-
 examples/server/tests/conftest.py             |   15 +
 .../server/tests/features/ctx_shift.feature   |   66 -
 .../server/tests/features/embeddings.feature  |  113 --
 examples/server/tests/features/environment.py |   71 -
 examples/server/tests/features/infill.feature |   36 -
 examples/server/tests/features/issues.feature |    5 -
 examples/server/tests/features/lora.feature   |   36 -
 .../server/tests/features/parallel.feature    |  131 --
 .../server/tests/features/passkey.feature     |   56 -
 examples/server/tests/features/rerank.feature |   42 -
 .../server/tests/features/results.feature     |  118 --
 .../server/tests/features/security.feature    |   68 -
 examples/server/tests/features/server.feature |  120 --
 .../server/tests/features/slotsave.feature    |   58 -
 examples/server/tests/features/steps/steps.py | 1518 -----------------
 .../tests/features/wrong_usages.feature       |   25 -
 examples/server/tests/requirements.txt        |    2 +-
 examples/server/tests/tests.sh                |    5 +-
 examples/server/tests/unit/test_basic.py      |   34 +
 .../server/tests/unit/test_chat_completion.py |  129 ++
 examples/server/tests/unit/test_completion.py |  223 +++
 examples/server/tests/unit/test_ctx_shift.py  |   67 +
 examples/server/tests/unit/test_embedding.py  |   99 ++
 examples/server/tests/unit/test_infill.py     |   35 +
 examples/server/tests/unit/test_lora.py       |   42 +
 examples/server/tests/unit/test_rerank.py     |   38 +
 examples/server/tests/unit/test_security.py   |   83 +
 examples/server/tests/unit/test_slot_save.py  |   98 ++
 examples/server/tests/unit/test_tokenize.py   |   59 +
 examples/server/tests/utils.py                |  377 ++++
 34 files changed, 1317 insertions(+), 2497 deletions(-)
 create mode 100644 examples/server/tests/conftest.py
 delete mode 100644 examples/server/tests/features/ctx_shift.feature
 delete mode 100644 examples/server/tests/features/embeddings.feature
 delete mode 100644 examples/server/tests/features/environment.py
 delete mode 100644 examples/server/tests/features/infill.feature
 delete mode 100644 examples/server/tests/features/issues.feature
 delete mode 100644 examples/server/tests/features/lora.feature
 delete mode 100644 examples/server/tests/features/parallel.feature
 delete mode 100644 examples/server/tests/features/passkey.feature
 delete mode 100644 examples/server/tests/features/rerank.feature
 delete mode 100644 examples/server/tests/features/results.feature
 delete mode 100644 examples/server/tests/features/security.feature
 delete mode 100644 examples/server/tests/features/server.feature
 delete mode 100644 examples/server/tests/features/slotsave.feature
 delete mode 100644 examples/server/tests/features/steps/steps.py
 delete mode 100644 examples/server/tests/features/wrong_usages.feature
 create mode 100644 examples/server/tests/unit/test_basic.py
 create mode 100644 examples/server/tests/unit/test_chat_completion.py
 create mode 100644 examples/server/tests/unit/test_completion.py
 create mode 100644 examples/server/tests/unit/test_ctx_shift.py
 create mode 100644 examples/server/tests/unit/test_embedding.py
 create mode 100644 examples/server/tests/unit/test_infill.py
 create mode 100644 examples/server/tests/unit/test_lora.py
 create mode 100644 examples/server/tests/unit/test_rerank.py
 create mode 100644 examples/server/tests/unit/test_security.py
 create mode 100644 examples/server/tests/unit/test_slot_save.py
 create mode 100644 examples/server/tests/unit/test_tokenize.py
 create mode 100644 examples/server/tests/utils.py

diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix
index 392e9ffe4..56ea18278 100644
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -34,7 +34,7 @@ let
 
     # server tests
     openai
-    behave
+    pytest
     prometheus-client
   ];
 in
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 699ac095d..2e8e3348f 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -122,14 +122,14 @@ jobs:
         id: server_integration_tests
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh
+          ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+          SLOW_TESTS=1 ./tests.sh
 
 
   server-windows:
@@ -180,11 +180,12 @@ jobs:
         run: |
           cd examples/server/tests
           $env:PYTHONIOENCODING = ":replace"
-          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+          pytest -v -x
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          behave.exe --stop --no-skipped --no-capture --tags slow
+          $env:SLOW_TESTS = "1"
+          pytest -v -x
diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore
index 1d17dae13..90ee7fe6d 100644
--- a/examples/server/tests/.gitignore
+++ b/examples/server/tests/.gitignore
@@ -1 +1,2 @@
 .venv
+tmp
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 10f22c447..2930a2e0d 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,19 +1,9 @@
 # Server tests
 
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
-and [behave](https://behave.readthedocs.io/en/latest/):
-
-* [issues.feature](./features/issues.feature) Pending issues scenario
-* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
-* [security.feature](./features/security.feature) Security, CORS and API Key
-* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/).
 
 Tests target GitHub workflows job runners with 4 vCPU.
 
-Requests are
-using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
-based http client.
-
 Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
 To mitigate it, you can increase values in `n_predict`, `kv_size`.
 
@@ -39,26 +29,19 @@ It's possible to override some scenario steps values with environment variables:
 |--------------------------|------------------------------------------------------------------------------------------------|
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
 | `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                         |
-| `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
+| `DEBUG`                  | to enable steps and server verbose mode `--verbose`                                       |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |
 
-### Run @bug, @wip or @wrong_usage annotated scenario
-
-Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
-
-- `@bug` annotation aims to link a scenario with a GitHub issue.
-- `@wrong_usage` are meant to show user issue that are actually an expected behavior
-- `@wip` to focus on a scenario working in progress
-- `@slow` heavy test, disabled by default
-
-To run a scenario annotated with `@bug`, start:
+To run slow tests:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
+SLOW_TESTS=1 ./tests.sh
 ```
 
-After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
+To run with stdout/stderr display in real time (verbose output, but useful for debugging):
 
 ```shell
-./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
+DEBUG=1 ./tests.sh -s -v -x
 ```
+
+To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
diff --git a/examples/server/tests/conftest.py b/examples/server/tests/conftest.py
new file mode 100644
index 000000000..017d1bb84
--- /dev/null
+++ b/examples/server/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from utils import *
+
+
+# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
+@pytest.fixture(autouse=True)
+def stop_server_after_each_test():
+    # do nothing before each test
+    yield
+    # stop all servers after each test
+    instances = set(
+        server_instances
+    )  # copy the set to prevent 'Set changed size during iteration'
+    for server in instances:
+        server.stop()
diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature
deleted file mode 100644
index ae6c6b01b..000000000
--- a/examples/server/tests/features/ctx_shift.feature
+++ /dev/null
@@ -1,66 +0,0 @@
-@llama.cpp
-@ctx_shift
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
-    And   a model alias tinyllama-2
-    And   BOS token is 1
-    And   42 as server seed
-    And   256 KV cache size
-    And   32 as batch size
-    And   2 slots
-
-    # the prompt is 301 tokens
-    # the slot context is 256/2 = 128 tokens
-    # the prompt is truncated to keep the last 109 tokens
-    # 64 tokens are generated thanks to shifting the context when it gets full
-  Scenario: Inference with context shift
-    And   64 server max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And   the completion is  truncated
-    And   109 prompt tokens are processed
-
-  Scenario Outline: Inference without context shift
-    And   <n_predict> server max tokens to predict
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Hi how are you
-    """
-    And   a completion request with no api error
-    Then  <n_token_output> tokens are predicted matching twind|Anna
-    And   the completion is <truncated> truncated
-    And   8 prompt tokens are processed
-    Examples:
-      | n_predict | n_token_output | truncated |
-      | 64        | 64             | not       |
-      | -1        | 120            |           |
-
-  Scenario: Inference without context shift (expected error: prompt too long)
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with 400 api error
-
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
deleted file mode 100644
index f4fe2ee43..000000000
--- a/examples/server/tests/features/embeddings.feature
+++ /dev/null
@@ -1,113 +0,0 @@
-@llama.cpp
-@embeddings
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
-    And   a model file bert-bge-small.gguf
-    And   a model alias bert-bge-small
-    And   42 as server seed
-    And   2 slots
-    # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
-    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   128 as batch size
-    And   128 as ubatch size
-    And   512 KV cache size
-    And   enable embeddings endpoint
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Embedding
-    When embeddings are computed for:
-    """
-    What is the capital of Bulgaria ?
-    """
-    Then embeddings are generated
-
-  Scenario: Embedding (error: prompt too long)
-    When embeddings are computed for:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And  embeddings request with 500 api error
-
-  Scenario: OAI Embeddings compatibility
-    Given a model bert-bge-small
-    When an OAI compatible embeddings computation request for:
-    """
-    What is the capital of Spain ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model bert-bge-small
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: Multi users OAI compatibility embeddings
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    And a prompt:
-      """
-      What is the biggest US city ?
-      """
-    And a prompt:
-      """
-      What is the capital of Bulgaria ?
-      """
-    And   a model bert-bge-small
-    Given concurrent OAI embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: All embeddings should be the same
-    Given 10 fixed prompts
-    And   a model bert-bge-small
-    Given concurrent OAI embedding requests
-    Then all embeddings are the same
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
deleted file mode 100644
index e7845dc2f..000000000
--- a/examples/server/tests/features/environment.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import signal
-import socket
-import sys
-import time
-import traceback
-from contextlib import closing
-from subprocess import TimeoutExpired
-
-
-def before_scenario(context, scenario):
-    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
-    if context.debug:
-        print("DEBUG=ON")
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
-    port = 8080
-    if 'PORT' in os.environ:
-        port = int(os.environ['PORT'])
-    if is_server_listening("localhost", port):
-        assert False, "Server already started"
-
-
-def after_scenario(context, scenario):
-    try:
-        if 'server_process' not in context or context.server_process is None:
-            return
-        if scenario.status == "failed":
-            if 'GITHUB_ACTIONS' in os.environ:
-                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
-                if os.path.isfile('llama.log'):
-                    with closing(open('llama.log', 'r')) as f:
-                        for line in f:
-                            print(line)
-            if not is_server_listening(context.server_fqdn, context.server_port):
-                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
-
-        if context.server_process.poll() is not None:
-            assert False, f"Server not running pid={context.server_process.pid} ..."
-
-        server_graceful_shutdown(context)  # SIGINT
-
-        try:
-            context.server_process.wait(0.5)
-        except TimeoutExpired:
-            print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
-            context.server_process.kill()  # SIGKILL
-            context.server_process.wait()
-
-        while is_server_listening(context.server_fqdn, context.server_port):
-            time.sleep(0.1)
-    except Exception:
-        print("ignoring error in after_scenario:")
-        traceback.print_exc(file=sys.stdout)
-
-
-def server_graceful_shutdown(context):
-    print(f"shutting down server pid={context.server_process.pid} ...")
-    if os.name == 'nt':
-        interrupt = signal.CTRL_C_EVENT
-    else:
-        interrupt = signal.SIGINT
-    context.server_process.send_signal(interrupt)
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature
deleted file mode 100644
index a0bbfef77..000000000
--- a/examples/server/tests/features/infill.feature
+++ /dev/null
@@ -1,36 +0,0 @@
-@llama.cpp
-@infill
-Feature: llama.cpp server
-
-  # The current model is made by adding FIM tokens to the existing stories260K
-  # We may want to use a better model in the future, maybe something like SmolLM 360M
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
-    And   a model file test-model-infill.gguf
-    And   a model alias tinyllama-infill
-    And   42 as server seed
-    And   1024 as batch size
-    And   1024 as ubatch size
-    And   2048 KV cache size
-    And   64 max tokens to predict
-    And   0.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Infill without input_extra
-    Given a prompt "Complete this"
-    And   an infill input extra none none
-    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
-    And   an infill input suffix "}\n"
-    And   an infill request with no api error
-    Then  64 tokens are predicted matching One|day|she|saw|big|scary|bird
-
-  Scenario: Infill with input_extra
-    Given a prompt "Complete this"
-    And   an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
-    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
-    And   an infill input suffix "}\n"
-    And   an infill request with no api error
-    Then  64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
deleted file mode 100644
index 7b13e44ca..000000000
--- a/examples/server/tests/features/issues.feature
+++ /dev/null
@@ -1,5 +0,0 @@
-# List of ongoing issues
-# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
-@bug
-Feature: Issues
-  # No confirmed issue at the moment
diff --git a/examples/server/tests/features/lora.feature b/examples/server/tests/features/lora.feature
deleted file mode 100644
index 7b85988ac..000000000
--- a/examples/server/tests/features/lora.feature
+++ /dev/null
@@ -1,36 +0,0 @@
-@llama.cpp
-@lora
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
-    And   a model file stories15M_MOE-F16.gguf
-    And   a model alias stories15M_MOE
-    And   a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
-    And   42 as server seed
-    And   1024 as batch size
-    And   1024 as ubatch size
-    And   2048 KV cache size
-    And   64 max tokens to predict
-    And   0.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Completion LoRA disabled
-    Given switch off lora adapter 0
-    Given a prompt:
-    """
-    Look in thy glass
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching little|girl|three|years|old
-
-  Scenario: Completion LoRA enabled
-    Given switch on lora adapter 0
-    Given a prompt:
-    """
-    Look in thy glass
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching eye|love|glass|sun
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
deleted file mode 100644
index 423d0f1d4..000000000
--- a/examples/server/tests/features/parallel.feature
+++ /dev/null
@@ -1,131 +0,0 @@
-@llama.cpp
-@parallel
-Feature: Parallel
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And   a model file test-model-00001-of-00003.gguf
-    And   42 as server seed
-    And   128 as batch size
-    And   256 KV cache size
-    And   2 slots
-    And   continuous batching
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Multi users completion
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And <n_predict> max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | n_predict |
-      | 128       |
-
-  Scenario Outline: Multi users OAI completions compatibility
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users OAI completions compatibility no v1
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests no v1
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users with number of prompts exceeding number of slots
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And a prompt:
-      """
-      What is LLM?
-      """
-    And a prompt:
-      """
-      The sky is blue and I love it.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    And 128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted
diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
deleted file mode 100644
index ff0a82cc4..000000000
--- a/examples/server/tests/features/passkey.feature
+++ /dev/null
@@ -1,56 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags passkey
-@passkey
-@slow
-Feature: Passkey / Self-extend with context shift
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-
-  # Generates a long text of junk and inserts a secret passkey number inside it.
-  # Then we query the LLM for the secret passkey.
-  # see #3856 and #4810
-  Scenario Outline: Passkey
-    Given a model file <hf_file> from HF repo <hf_repo>
-    And   <n_batch> as batch size
-    And   <n_junk> as number of junk
-    And   <n_predicted> server max tokens to predict
-    And   42 as seed
-    And   0.0 temperature
-    And   <n_ctx> KV cache size
-    And   1 slots
-    And   <n_ga> group attention factor to extend context size through self-extend
-    And   <n_ga_w> group attention width to extend context size through self-extend
-    # Can be override with N_GPU_LAYERS
-    And   <ngl> GPU offloaded layers
-    Then  the server is starting
-    # Higher timeout because the model may need to be downloaded from the internet
-    Then  the server is healthy with timeout 120 seconds
-    Given available models
-    Then  model 0 is trained on <n_ctx_train> tokens context
-    Given a prefix prompt:
-    """
-    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
-    """
-    And a passkey prompt template:
-    """
-    The pass key is <passkey> Remember it. <passkey> is the pass key.
-    """
-    And a junk suffix prompt:
-    """
-    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
-    """
-    And a suffix prompt:
-    """
-    What is the pass key? The pass key is
-    """
-    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
-    And  a completion request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-
-    Examples:
-      | hf_repo                         | hf_file                     | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content     |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42             |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b  |
-      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096        | 3   | 16384 | 512     | 4    | 512    | 500    | 300   | 1234    | 5           | 1234           |
-      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768       | 2   | 16384 | 512     | 4    | 512    | 500    | 100   | 0987    | 5           | 0
-      # 987           |
diff --git a/examples/server/tests/features/rerank.feature b/examples/server/tests/features/rerank.feature
deleted file mode 100644
index c36cc8e21..000000000
--- a/examples/server/tests/features/rerank.feature
+++ /dev/null
@@ -1,42 +0,0 @@
-@llama.cpp
-@rerank
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
-    And   a model file jina-reranker-v1-tiny-en.gguf
-    And   a model alias jina-reranker-v1-tiny-en
-    And   42 as server seed
-    And   2 slots
-    And   512 as batch size
-    And   512 as ubatch size
-    And   512 KV cache size
-    And   enable reranking endpoint
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Rerank
-    Given a rerank query:
-      """
-      Machine learning is
-      """
-    And   a rerank document:
-      """
-      A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
-      """
-    And   a rerank document:
-      """
-      Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
-      """
-    And   a rerank document:
-      """
-      Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
-      """
-    And   a rerank document:
-      """
-      Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
-      """
-    When  reranking request
-    Then  reranking results are returned
-    Then  reranking highest score is index 2 and lowest score is index 3
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
deleted file mode 100644
index e8e1b5414..000000000
--- a/examples/server/tests/features/results.feature
+++ /dev/null
@@ -1,118 +0,0 @@
-@llama.cpp
-@results
-Feature: Results
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And   a model file test-model-00001-of-00003.gguf
-    And   128 as batch size
-    And   1024 KV cache size
-    And   128 max tokens to predict
-    And   continuous batching
-
-  Scenario Outline: consistent results with same seed
-    Given <n_slots> slots
-    And   1.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all predictions are equal
-    Examples:
-      | n_slots |
-      | 1       |
-      # FIXME: unified KV cache nondeterminism
-      # | 2       |
-
-  Scenario Outline: different results with different seed
-    Given <n_slots> slots
-    And   1.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all predictions are different
-    Examples:
-      | n_slots |
-      | 1       |
-      | 2       |
-
-  Scenario Outline: consistent results with same seed and varying batch size
-    Given 4 slots
-    And   <temp> temperature
-    # And   0 as draft
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "Write a very long story about AI." with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then  the server is idle
-    And   all slots are idle
-
-    Given <n_parallel> prompts "Write a very long story about AI." with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And  all slots are idle
-
-    Then all predictions are equal
-    Examples:
-      | n_parallel | temp |
-      | 1          | 0.0  |
-      | 1          | 1.0  |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 2          | 0.0  |
-      # | 4          | 0.0  |
-      # | 2          | 1.0  |
-      # | 4          | 1.0  |
-
-  Scenario Outline: consistent token probs with same seed and prompt
-    Given <n_slots> slots
-    And   <n_kv> KV cache size
-    And   1.0 temperature
-    And   <n_predict> max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "The meaning of life is" with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then  the server is idle
-    And   all slots are idle
-
-    Given <n_parallel> prompts "The meaning of life is" with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And  all slots are idle
-
-    Then all token probabilities are equal
-    Examples:
-      | n_slots | n_kv | n_predict | n_parallel |
-      | 4       | 1024 | 1         | 1          |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 4       | 1024 | 1         | 4          |
-      # | 4       | 1024 | 100       | 1          |
-      # This test still fails even the above patches; the first token probabilities are already different.
-      # | 4       | 1024 | 100       | 4          |
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
deleted file mode 100644
index ef30007c3..000000000
--- a/examples/server/tests/features/security.feature
+++ /dev/null
@@ -1,68 +0,0 @@
-@llama.cpp
-@security
-Feature: Security
-
-  Background: Server startup with an api key defined
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a server api key THIS_IS_THE_KEY
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Completion with some user api key
-    Given a prompt test
-    And   a user api key <api_key>
-    And   4 max tokens to predict
-    And   a completion request with <api_error> api error
-
-    Examples: Prompts
-      | api_key         | api_error |
-      | THIS_IS_THE_KEY | no        |
-      | THIS_IS_THE_KEY | no        |
-      | hackeme         | raised    |
-      |                 | raised    |
-
-  Scenario Outline: OAI Compatibility
-    Given a system prompt test
-    And   a user prompt test
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    And   a user api key <api_key>
-    Given an OAI compatible chat completions request with <api_error> api error
-
-    Examples: Prompts
-      | api_key         | api_error |
-      | THIS_IS_THE_KEY | no        |
-      | THIS_IS_THE_KEY | no        |
-      | hackme          | raised    |
-
-  Scenario Outline: OAI Compatibility (invalid response formats)
-    Given a system prompt test
-    And   a user prompt test
-    And   a response format <response_format>
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    Given an OAI compatible chat completions request with raised api error
-
-    Examples: Prompts
-      | response_format                                       |
-      | {"type": "sound"}                                     |
-      | {"type": "json_object", "schema": 123}                |
-      | {"type": "json_object", "schema": {"type": 123}}      |
-      | {"type": "json_object", "schema": {"type": "hiccup"}} |
-
-
-  Scenario Outline: CORS Options
-    Given a user api key THIS_IS_THE_KEY
-    When  an OPTIONS request is sent from <origin>
-    Then  CORS header <cors_header> is set to <cors_header_value>
-
-    Examples: Headers
-      | origin          | cors_header                      | cors_header_value |
-      | localhost       | Access-Control-Allow-Origin      | localhost         |
-      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
-      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | GET, POST         |
-      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
deleted file mode 100644
index 15e24c624..000000000
--- a/examples/server/tests/features/server.feature
+++ /dev/null
@@ -1,120 +0,0 @@
-@llama.cpp
-@server
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
-    And   a model alias tinyllama-2
-    And   BOS token is 1
-    And   42 as server seed
-      # KV Cache corresponds to the total amount of tokens
-      # that can be stored across all independent sequences: #4130
-      # see --ctx-size and #5568
-    And   256 KV cache size
-    And   32 as batch size
-    And   2 slots
-    And   64 server max tokens to predict
-    And   prometheus compatible metrics exposed
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Health
-    Then the server is ready
-    And  all slots are idle
-
-
-  Scenario Outline: Completion
-    Given a prompt <prompt>
-    And   <n_predict> max tokens to predict
-    And   a completion request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   the completion is <truncated> truncated
-    And   <n_prompt> prompt tokens are processed
-    And   prometheus metrics are exposed
-    And   metric llamacpp:tokens_predicted is <n_predicted>
-
-    Examples: Prompts
-      | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
-
-  Scenario: Completion prompt truncated
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And   the completion is  truncated
-    And   109 prompt tokens are processed
-
-
-  Scenario Outline: OAI Compatibility
-    Given a model <model>
-    And   a system prompt <system_prompt>
-    And   a user prompt <user_prompt>
-    And   <max_tokens> max tokens to predict
-    And   streaming is <enable_streaming>
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   <n_prompt> prompt tokens are processed
-    And   the completion is <truncated> truncated
-
-    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
-
-
-  Scenario Outline: OAI Compatibility w/ response format
-    Given a model test
-    And   a system prompt test
-    And   a user prompt test
-    And   a response format <response_format>
-    And   10 max tokens to predict
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-
-    Examples: Prompts
-      | response_format                                                     | n_predicted | re_content             |
-      | {"type": "json_object", "schema": {"const": "42"}}                  | 6           | "42"                   |
-      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
-      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
-
-
-  Scenario: Tokenize / Detokenize
-    When tokenizing:
-    """
-    What is the capital of France ?
-    """
-    Then tokens can be detokenized
-    And  tokens do not begin with BOS
-
-  Scenario: Tokenize w/ BOS
-    Given adding special tokens
-    When  tokenizing:
-    """
-    What is the capital of Germany?
-    """
-    Then  tokens begin with BOS
-    Given first token is removed
-    Then  tokens can be detokenized
-
-  Scenario: Tokenize with pieces
-    When  tokenizing with pieces:
-    """
-    What is the capital of Germany?
-    媽
-    """
-    Then  tokens are given with pieces
-
-  Scenario: Models available
-    Given available models
-    Then  1 models are supported
-    Then  model 0 is identified by tinyllama-2
-    Then  model 0 is trained on 128 tokens context
diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature
deleted file mode 100644
index 1c281c074..000000000
--- a/examples/server/tests/features/slotsave.feature
+++ /dev/null
@@ -1,58 +0,0 @@
-@llama.cpp
-@slotsave
-Feature: llama.cpp server slot management
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   prompt caching is enabled
-    And   2 slots
-    And   . as slot save path
-    And   2048 KV cache size
-    And   42 as server seed
-    And   24 max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Save and Restore Slot
-    # First prompt in slot 1 should be fully processed
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
-    When  the slot 1 is saved with filename "slot1.bin"
-    Then  the server responds with status code 200
-    # Since we have cache, this should only process the last tokens
-    Given a user prompt "What is the capital of Germany?"
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
-    And   7 prompt tokens are processed
-    # Loading the original cache into slot 0,
-    # we should only be processing 1 prompt token and get the same output
-    When  the slot 0 is restored with filename "slot1.bin"
-    Then  the server responds with status code 200
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 0
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   1 prompt tokens are processed
-    # For verification that slot 1 was not corrupted during slot 0 load, same thing
-    Given a user prompt "What is the capital of Germany?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
-    And   1 prompt tokens are processed
-
-  Scenario: Erase Slot
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
-    When  the slot 1 is erased
-    Then  the server responds with status code 200
-    Given a user prompt "What is the capital of France?"
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
deleted file mode 100644
index 687b163f4..000000000
--- a/examples/server/tests/features/steps/steps.py
+++ /dev/null
@@ -1,1518 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import asyncio
-import json
-import os
-import re
-import socket
-import subprocess
-import sys
-import threading
-import time
-import requests
-from collections.abc import Sequence
-from contextlib import closing
-from re import RegexFlag
-from typing import Any, Literal, cast
-
-import aiohttp
-import numpy as np
-import openai
-from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
-from behave.api.async_step import async_run_until_complete
-from prometheus_client import parser
-
-# pyright: reportRedeclaration=false
-
-DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
-
-@step("a server listening on {server_fqdn}:{server_port}")
-def step_server_config(context, server_fqdn: str, server_port: str):
-    context.server_fqdn = server_fqdn
-    context.server_port = int(server_port)
-    context.n_threads = None
-    context.n_gpu_layer = None
-    if 'PORT' in os.environ:
-        context.server_port = int(os.environ['PORT'])
-        print(f"$PORT set, overriding server port with to {context.server_port}")
-    if 'FQDN' in os.environ:
-        context.server_fqdn = os.environ['FQDN']
-        print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
-    if 'N_GPU_LAYERS' in os.environ:
-        context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
-        print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")
-
-    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
-
-    context.model_alias = None
-    context.model_file = None
-    context.model_hf_repo = None
-    context.model_hf_file = None
-    context.model_url = None
-    context.n_batch = None
-    context.n_ubatch = None
-    context.n_ctx = None
-    context.n_ga = None
-    context.n_ga_w = None
-    context.n_predict = None
-    context.n_prompts = 0
-    context.n_server_predict = None
-    context.slot_save_path = None
-    context.id_slot = None
-    context.cache_prompt = None
-    context.n_slots = None
-    context.prompt_prefix = None
-    context.prompt_suffix = None
-    context.server_api_key = None
-    context.server_continuous_batching = False
-    context.server_embeddings = False
-    context.server_reranking = False
-    context.server_metrics = False
-    context.server_process = None
-    context.seed = None
-    context.draft = None
-    context.server_seed = None
-    context.user_api_key = None
-    context.response_format = None
-    context.temperature = None
-    context.lora_file = None
-    context.disable_ctx_shift = False
-
-    # infill
-    context.infill_input_extra = None
-    context.infill_input_suffix = ''
-    context.infill_input_prefix = ''
-
-    context.tasks_result = []
-    context.concurrent_tasks = []
-    context.prompts = []
-
-    context.reranking_query = None
-    context.reranking_documents = []
-    context.reranking_results = None
-
-
-@step('a model file {hf_file} from HF repo {hf_repo}')
-def step_download_hf_model(context, hf_file: str, hf_repo: str):
-    context.model_hf_repo = hf_repo
-    context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
-
-@step('a lora adapter file from {lora_file_url}')
-def step_download_lora_file(context, lora_file_url: str):
-    file_name = lora_file_url.split('/').pop()
-    context.lora_file = f'../../../{file_name}'
-    with open(context.lora_file, 'wb') as f:
-        f.write(requests.get(lora_file_url).content)
-
-@step('a model file {model_file}')
-def step_model_file(context, model_file: str):
-    context.model_file = model_file
-
-
-@step('a model url {model_url}')
-def step_model_url(context, model_url: str):
-    context.model_url = model_url
-
-
-@step('a model alias {model_alias}')
-def step_model_alias(context, model_alias: str):
-    context.model_alias = model_alias
-
-
-@step('{seed:d} as server seed')
-def step_seed(context, seed: int):
-    context.server_seed = seed
-
-
-@step('{ngl:d} GPU offloaded layers')
-def step_n_gpu_layer(context, ngl: int):
-    if 'N_GPU_LAYERS' in os.environ:
-        new_ngl = int(os.environ['N_GPU_LAYERS'])
-        if context.debug:
-            print(f"-ngl upgraded from {ngl} to {new_ngl}")
-        ngl = new_ngl
-    context.n_gpu_layer = ngl
-
-
-@step('{n_threads:d} threads')
-def step_n_threads(context, n_threads: int):
-    context.n_thread = n_threads
-
-
-@step('{draft:d} as draft')
-def step_draft(context, draft: int):
-    context.draft = draft
-
-
-@step('{n_ctx:d} KV cache size')
-def step_n_ctx(context, n_ctx: int):
-    context.n_ctx = n_ctx
-
-
-@step('{n_slots:d} slots')
-def step_n_slots(context, n_slots: int):
-    context.n_slots = n_slots
-
-
-@step('{n_predict:d} server max tokens to predict')
-def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
-
-
-@step('{slot_save_path} as slot save path')
-def step_slot_save_path(context, slot_save_path: str):
-    context.slot_save_path = slot_save_path
-
-
-@step('using slot id {id_slot:d}')
-def step_id_slot(context, id_slot: int):
-    context.id_slot = id_slot
-
-
-@step('prompt caching is enabled')
-def step_enable_prompt_cache(context):
-    context.cache_prompt = True
-
-
-@step('continuous batching')
-def step_server_continuous_batching(context):
-    context.server_continuous_batching = True
-
-
-@step('enable embeddings endpoint')
-def step_server_embeddings(context):
-    context.server_embeddings = True
-
-@step('enable reranking endpoint')
-def step_server_reranking(context):
-    context.server_reranking = True
-
-@step('prometheus compatible metrics exposed')
-def step_server_metrics(context):
-    context.server_metrics = True
-
-@step('disable context shifting')
-def step_server_disable_ctx_shift(context):
-    context.disable_ctx_shift = True
-
-@step("the server is starting")
-def step_start_server(context):
-    start_server_background(context)
-    attempts = 0
-    max_attempts = 20
-    if 'GITHUB_ACTIONS' in os.environ:
-        max_attempts *= 2
-
-    addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
-    family, typ, proto, _, sockaddr = addrs[0]
-
-    while True:
-        with closing(socket.socket(family, typ, proto)) as sock:
-            result = sock.connect_ex(sockaddr)
-            if result == 0:
-                print("\x1b[33;46mserver started!\x1b[0m")
-                return
-            attempts += 1
-            if attempts > max_attempts:
-                assert False, "server not started"
-            print(f"waiting for server to start, connect error code = {result}...")
-            time.sleep(0.1)
-
-
-async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    match expecting_status:
-        case 'healthy':
-            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout)
-
-        case 'ready' | 'idle':
-            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout,
-                                        params={'fail_on_no_slot': 1},
-                                        slots_idle=context.n_slots,
-                                        slots_processing=0)
-        case 'busy':
-            await wait_for_slots_status(context, context.base_url, 503,
-                                        params={'fail_on_no_slot': 1},
-                                        slots_idle=0,
-                                        slots_processing=context.n_slots)
-        case _:
-            assert False, "unknown status"
-
-
-@step("the server is {expecting_status} with timeout {timeout:d} seconds")
-@async_run_until_complete
-async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
-
-
-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
-    await wait_for_server_status_with_timeout(context, expecting_status, 30)
-
-
-@step('all slots are {expected_slot_status_string}')
-@async_run_until_complete
-async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
-    match expected_slot_status_string:
-        case 'idle':
-            expected_slot_status = False
-        case 'busy':
-            expected_slot_status = True
-        case _:
-            assert False, "unknown status"
-
-    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
-                      for slot_id in range(context.n_slots)]
-    await request_slots_status(context, expected_slots)
-
-
-@step('a completion request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error: Literal['raised'] | str):
-    expect_api_error = api_error == 'raised' or api_error != 'no'
-    seeds = await completions_seed(context, num_seeds=1)
-    completion = await request_completion(context.prompts.pop(),
-                                          seeds[0] if seeds is not None else seeds,
-                                          context.base_url,
-                                          debug=context.debug,
-                                          n_predict=context.n_predict,
-                                          cache_prompt=context.cache_prompt,
-                                          id_slot=context.id_slot,
-                                          expect_api_error=expect_api_error,
-                                          user_api_key=context.user_api_key,
-                                          temperature=context.temperature)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if api_error == 'raised':
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-    elif api_error.isdigit():
-        api_error_code = int(api_error)
-        assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
-
-
-@step('an infill request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error: Literal['raised'] | str):
-    if api_error != 'no':
-        raise ValueError(f'api_error={api_error} is not yet implemented')
-    payload = {
-        "prompt": context.prompts[0],
-        "input_suffix": context.infill_input_suffix,
-        "input_prefix": context.infill_input_prefix,
-        "n_predict": context.n_predict,
-        "seed": context.seed,
-        "temperature": context.temperature,
-    }
-    if context.infill_input_extra is not None:
-        payload['input_extra'] = context.infill_input_extra
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/infill',
-                                json=payload) as response:
-            assert response.status == 200
-            context.tasks_result = [await response.json()]
-
-
-@step('{predicted_n:d} tokens are predicted matching {re_content}')
-def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    context.completion = context.tasks_result.pop()
-    assert_n_tokens_predicted(context.completion, predicted_n, re_content)
-
-
-@step('{predicted_n:d} tokens are predicted')
-def step_n_tokens_predicted(context, predicted_n):
-    context.completion = context.tasks_result.pop()
-    assert_n_tokens_predicted(context.completion, predicted_n)
-
-
-@step('all predictions are equal')
-@async_run_until_complete
-async def step_predictions_equal(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_predictions_equal(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('all predictions are different')
-@async_run_until_complete
-async def step_predictions_different(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_predictions_different(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('all token probabilities are equal')
-@async_run_until_complete
-async def step_token_probabilities_equal(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_token_probabilities_equal(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('the completion is  truncated')
-def step_assert_completion_truncated(context):
-    step_assert_completion_truncated(context, '')
-
-
-@step('the completion is {truncated} truncated')
-def step_assert_completion_truncated(context, truncated):
-    truncated = truncated != "not"
-    assert context.completion['truncated'] == truncated, f'{context.completion}'
-
-
-@step('{n_prompt:d} prompt tokens are processed')
-def step_impl(context, n_prompt):
-    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
-
-
-@step('a user prompt {user_prompt}')
-def step_user_prompt(context, user_prompt):
-    context.prompts.append(user_prompt)
-    context.n_prompts = len(context.prompts)
-
-
-@step('a system prompt {system_prompt}')
-def step_system_prompt(context, system_prompt):
-    context.system_prompt = system_prompt
-
-
-@step('a model {model}')
-def step_model(context, model):
-    context.model = model
-
-
-@step('{max_tokens:d} max tokens to predict')
-def step_max_tokens(context, max_tokens):
-    context.n_predict = max_tokens
-
-
-@step('a response format {response_format}')
-def step_response_format(context, response_format):
-    context.response_format = json.loads(response_format)
-
-
-@step('{temperature:f} temperature')
-def step_temperature(context, temperature):
-    context.temperature = temperature
-
-
-@step('streaming is {enable_streaming}')
-def step_streaming(context, enable_streaming):
-    context.enable_streaming = enable_streaming == 'enabled'
-
-
-@step('a user api key {user_api_key}')
-def step_user_api_key(context, user_api_key):
-    context.user_api_key = user_api_key
-
-
-@step('no user api key')
-def step_no_user_api_key(context):
-    context.user_api_key = None
-
-
-@step('a user api key ')
-def step_no_user_api_key_space(context):
-    context.user_api_key = None
-
-
-@step('a server api key {server_api_key}')
-def step_server_api_key(context, server_api_key):
-    context.server_api_key = server_api_key
-
-
-@step('{n_junk:d} as number of junk')
-def step_n_junk(context, n_junk):
-    context.n_junk = n_junk
-
-
-@step('{n_batch:d} as batch size')
-def step_n_batch(context, n_batch):
-    context.n_batch = n_batch
-
-
-@step('{n_ubatch:d} as ubatch size')
-def step_n_ubatch(context, n_ubatch):
-    context.n_ubatch = n_ubatch
-
-
-@step('{seed:d} as seed')
-def step_seed(context, seed):
-    if context.seed is None:
-        context.seed = [seed]
-    else:
-        context.seed.append(seed)
-
-
-@step('BOS token is {bos:d}')
-def step_bos_token(context, bos):
-    context.bos = bos
-
-
-@step('a prefix prompt')
-def step_prompt_prefix(context):
-    context.prompt_prefix = context_text(context)
-
-
-@step('a junk suffix prompt')
-def step_prompt_junk_suffix(context):
-    context.prompt_junk_suffix = context_text(context)
-
-
-@step('a suffix prompt')
-def step_prompt_suffix(context):
-    context.prompt_suffix = context_text(context)
-
-
-@step('{n_ga:d} group attention factor'
-      ' to extend context size through self-extend')
-def step_impl(context, n_ga):
-    context.n_ga = n_ga
-
-
-@step('{n_ga_w:d} group attention width to extend context size through self-extend')
-def step_impl(context, n_ga_w):
-    context.n_ga_w = n_ga_w
-
-
-@step('a passkey prompt template')
-def step_prompt_passkey(context):
-    context.prompt_passkey = context_text(context)
-
-@step('a rerank query')
-def step_set_rerank_query(context):
-    context.reranking_query = context_text(context)
-    context.reranking_documents = []
-
-@step('a rerank document')
-def step_set_rerank_document(context):
-    context.reranking_documents.append(context_text(context))
-
-@step('{n_prompts:d} fixed prompts')
-def step_fixed_prompts(context, n_prompts):
-    context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
-    context.n_prompts = n_prompts
-
-
-@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
-def step_prompt_passkey(context, passkey, i_pos):
-    prompt = ""
-    for i in range(context.n_junk):
-        if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey # the passkey is already substituted
-        prompt += context.prompt_junk_suffix
-    if context.debug:
-        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
-        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```")
-    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
-    context.n_prompts = len(context.prompts)
-
-
-@step('an OAI compatible chat completions request with {api_error} api error')
-@async_run_until_complete
-async def step_oai_chat_completions(context, api_error):
-    if context.debug:
-        print(f"Submitting OAI compatible completions request...")
-    expect_api_error = api_error == 'raised'
-    seeds = await completions_seed(context, num_seeds=1),
-    completion = await oai_chat_completions(context.prompts.pop(),
-                                            seeds[0] if seeds is not None else seeds,
-                                            context.system_prompt,
-                                            context.base_url,
-                                            '/v1/chat',
-                                            False,
-                                            model=context.model if hasattr(context, 'model') else None,
-
-                                            n_predict=context.n_predict
-                                            if hasattr(context, 'n_predict') else None,
-
-                                            enable_streaming=context.enable_streaming
-                                            if hasattr(context, 'enable_streaming') else None,
-
-                                            response_format=context.response_format
-                                            if hasattr(context, 'response_format') else None,
-
-                                            user_api_key=context.user_api_key
-                                            if hasattr(context, 'user_api_key') else None,
-
-                                            expect_api_error=expect_api_error)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-    if context.debug:
-        print(f"Completion response: {completion}")
-
-
-@step('a prompt')
-def step_a_prompt(context):
-    context.prompts.append(context_text(context))
-    context.n_prompts = len(context.prompts)
-
-
-@step('a prompt {prompt}')
-def step_a_prompt_prompt(context, prompt):
-    context.prompts.append(prompt)
-    context.n_prompts = len(context.prompts)
-
-
-# TODO: allow this to be repeated
-@step('an infill input extra {filename} {text}')
-def step_infill_input_extra(context, filename, text):
-    if filename == 'none':
-        context.infill_input_extra = None
-    else:
-        context.infill_input_extra = [{'filename': filename, 'text': text}]
-
-
-@step('an infill input suffix {text}')
-def step_infill_input_suffix(context, text):
-    context.infill_input_suffix = text
-
-
-@step('an infill input prefix {text}')
-def step_infill_input_prefix(context, text):
-    context.infill_input_prefix = text
-
-
-@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
-def step_many_prompts(context, num_prompts, prompt, seed):
-    if context.seed is None:
-        context.seed = []
-    for _ in range(num_prompts):
-        context.seed.append(seed)
-        context.prompts.append(prompt)
-    context.n_prompts = len(context.prompts)
-
-
-@step('concurrent completion requests')
-@async_run_until_complete()
-async def step_concurrent_completion_requests(context):
-    await concurrent_requests(
-        context,
-        request_completion,
-        # prompt is inserted automatically
-        context.base_url,
-        debug=context.debug,
-        prompt_prefix=context.prompt_prefix,
-        prompt_suffix=context.prompt_suffix,
-        n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
-        user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
-        temperature=context.temperature,
-    )
-
-
-@step('concurrent OAI completions requests')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/v1/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              response_format=context.response_format
-                              if hasattr(context, 'response_format') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step('concurrent OAI completions requests no v1')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              response_format=context.response_format
-                              if hasattr(context, 'response_format') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step('all prompts are predicted')
-@async_run_until_complete
-async def step_all_prompts_are_predicted(context):
-    await all_prompts_are_predicted(context)
-
-
-@step('all prompts are predicted with {n_expected_predicted:d} tokens')
-@async_run_until_complete
-async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
-    await all_prompts_are_predicted(context, n_expected_predicted)
-
-
-async def all_prompts_are_predicted(context, expected_predicted_n=None):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions > 0
-    for i in range(n_completions):
-        assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
-    assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
-
-
-@step('embeddings are computed for')
-@async_run_until_complete
-async def step_compute_embedding(context):
-    context.n_prompts = 1
-    context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
-
-
-@step('reranking request')
-@async_run_until_complete
-async def step_compute_reranking(context):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/reranking',
-                                json={
-                                    "query": context.reranking_query,
-                                    "documents": context.reranking_documents,
-                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                context.reranking_results = response_json['results']
-            else:
-                context.reranking_results = response.status
-
-
-@step('all embeddings are the same')
-@async_run_until_complete
-async def step_all_embeddings_are_the_same(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
-    embeddings = []
-    for i in range(n_embedding_requests):
-        embedding = context.tasks_result.pop().pop()
-        embeddings.append(embedding)
-        assert_embeddings(embedding)
-    n = len(embeddings)
-    for i in range(n-1):
-        for j in range(i+1, n):
-            embedding1 = np.array(embeddings[i])
-            embedding2 = np.array(embeddings[j])
-            if context.debug:
-                print(f"embedding1: {embedding1[-8:]}")
-                print(f"embedding2: {embedding2[-8:]}")
-            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
-            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
-            if context.debug:
-                print(f"{msg}")
-            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
-
-
-@step('embeddings are generated')
-def step_assert_embeddings(context):
-    assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
-                                                             f"context.n_prompts={context.n_prompts}\n"
-                                                             f"context.embeddings={context.embeddings}")
-    for embedding in context.embeddings:
-        assert_embeddings(embedding)
-
-@step('embeddings request with {api_error_code:d} api error')
-def step_assert_embeddings(context, api_error_code: int):
-    assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"
-
-@step('an OAI compatible embeddings computation request for')
-@async_run_until_complete
-async def step_oai_compute_embeddings(context):
-    context.n_prompts = 1
-    context.embeddings = await request_oai_embeddings(context_text(context), None,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step('an OAI compatible embeddings computation request for multiple inputs')
-@async_run_until_complete
-async def step_oai_compute_embeddings_multiple_inputs(context):
-    context.embeddings = await request_oai_embeddings(context.prompts, None,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-    context.prompts.clear()
-
-
-@step('concurrent embedding requests')
-@async_run_until_complete()
-async def step_concurrent_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_embedding,
-                              # prompt is inserted automatically
-                              base_url=context.base_url)
-
-
-@step('concurrent OAI embedding requests')
-@async_run_until_complete()
-async def step_concurrent_oai_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_oai_embeddings,
-                              # prompt is inserted automatically
-                              base_url=context.base_url,
-                              async_client=True,
-                              model=context.model)
-
-
-@step('all embeddings are generated')
-@async_run_until_complete()
-async def all_embeddings_are_generated(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests == context.n_prompts
-    for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop().pop())
-
-@step('reranking results are returned')
-def reranking_results_are_returned(context):
-    assert len(context.reranking_results) == len(context.reranking_documents)
-
-@step('reranking highest score is index {idx_high:d} and lowest score is index {idx_low:d}')
-def reranking_results_are_returned(context, idx_high: int, idx_low: int):
-    max_score, max_idx = 0, 0
-    min_score, min_idx = 0, 0
-    for res in context.reranking_results:
-        if max_score < res['relevance_score']:
-            max_score = res['relevance_score']
-            max_idx   = res['index']
-        if min_score > res['relevance_score']:
-            min_score = res['relevance_score']
-            min_idx   = res['index']
-    print(context.reranking_results)
-    assert max_idx == idx_high
-    assert min_idx == idx_low
-
-@step('adding special tokens')
-def step_tokenize_set_add_special(context):
-    context.tokenize_add_special = True
-
-
-@step("tokenizing with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
-        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
-        if getattr(context, "tokenize_add_special", None) is not None:
-            tokenize_args["add_special"] = context.tokenize_add_special
-
-        async with session.post(
-            f"{context.base_url}/tokenize", json=tokenize_args
-        ) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens_with_pieces = tokenize_json["tokens"]
-
-
-@step("tokens are given with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    # Verify that the response contains both token IDs and pieces
-    assert all(
-        "id" in token and "piece" in token for token in context.tokens_with_pieces
-    )
-
-
-@step('tokenizing')
-@async_run_until_complete
-async def step_tokenize(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        tokenize_args = {
-            "content": context.tokenized_text,
-        }
-        if getattr(context, 'tokenize_add_special', None) is not None:
-            tokenize_args['add_special'] = context.tokenize_add_special
-        async with session.post(f'{context.base_url}/tokenize',
-                                json=tokenize_args) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens = tokenize_json['tokens']
-
-
-@step('tokens can be detokenized')
-@async_run_until_complete
-async def step_detokenize(context):
-    assert len(context.tokens) > 0
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/detokenize',
-                                json={
-                                    "tokens": context.tokens,
-                                }) as response:
-            assert response.status == 200
-            detokenize_json = await response.json()
-            # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
-            assert context.tokenized_text == detokenize_json['content'].strip()
-
-
-@step('tokens begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] == context.bos
-
-
-@step('tokens do not begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] != context.bos
-
-
-@step('first token is removed')
-def step_strings_for_tokenization(context):
-    context.tokens = context.tokens[1:]
-
-
-@step('an OPTIONS request is sent from {origin}')
-@async_run_until_complete
-async def step_options_request(context, origin):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
-        async with session.options(f'{context.base_url}/v1/chat/completions',
-                                    headers=headers) as response:
-            assert response.status == 200
-            context.options_response = response
-
-
-@step('CORS header {cors_header} is set to {cors_header_value}')
-def step_check_options_header_value(context, cors_header, cors_header_value):
-    assert context.options_response.headers[cors_header] == cors_header_value
-
-
-@step('prometheus metrics are exposed')
-@async_run_until_complete
-async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
-            assert metrics_response.status == 200
-            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
-            metrics_raw = await metrics_response.text()
-            metric_exported = False
-            if context.debug:
-                print(f"/metrics answer:\n{metrics_raw}")
-            context.metrics = {}
-            for metric in parser.text_string_to_metric_families(metrics_raw):
-                match metric.name:
-                    case "llamacpp:kv_cache_usage_ratio":
-                        assert len(metric.samples) > 0
-                        metric_exported = True
-                context.metrics[metric.name] = metric
-            assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
-            assert metric_exported, "No metrics exported"
-
-
-@step('metric {metric_name} is {metric_value:d}')
-def step_assert_metric_value(context, metric_name, metric_value):
-    if metric_name not in context.metrics:
-        assert False, f"no metric {metric_name} in {context.metrics.keys()}"
-    assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
-
-
-@step('available models')
-def step_available_models(context):
-    # openai client always expects an api_key
-    openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
-    openai.base_url = f'{context.base_url}/v1/'
-    context.models = openai.models.list().data
-
-
-@step('{n_model:d} models are supported')
-def step_supported_models(context, n_model):
-    if context.debug:
-        print("server models available:", context.models)
-    assert len(context.models) == n_model
-
-
-@step('model {i_model:d} is {param} {preposition} {param_value}')
-def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
-    assert i_model < len(context.models)
-    model = context.models[i_model]
-
-    param_value = param_value.split(' ', 1)[0]
-    match param:
-        case 'identified':
-            value = model.id
-        case 'trained':
-            value = str(model.meta["n_ctx_train"])
-        case _:
-            assert False, "param {param} not supported"
-    assert param_value == value, f"model param {param} {value} != {param_value}"
-
-
-async def concurrent_requests(context, f_completion, *args, **kwargs):
-    context.n_prompts = len(context.prompts)
-    if context.debug:
-        print(f"starting {context.n_prompts} concurrent completion requests...")
-    assert context.n_prompts > 0
-    seeds = await completions_seed(context)
-    assert seeds is not None
-    for prompt_no in range(context.n_prompts):
-        shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
-        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.01)
-
-
-@step('the slot {slot_id:d} is saved with filename "{filename}"')
-@async_run_until_complete
-async def step_save_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
-                                json={"filename": filename},
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('the slot {slot_id:d} is restored with filename "{filename}"')
-@async_run_until_complete
-async def step_restore_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
-                                json={"filename": filename},
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('the slot {slot_id:d} is erased')
-@async_run_until_complete
-async def step_erase_slot(context, slot_id):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('switch {on_or_off} lora adapter {lora_id:d}')
-@async_run_until_complete
-async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/lora-adapters',
-                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-            print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}])
-
-
-@step('the server responds with status code {status_code:d}')
-def step_server_responds_with_status_code(context, status_code):
-    assert context.response.status == status_code
-
-
-async def request_completion(prompt,
-                             seed,
-                             base_url,
-                             debug=False,
-                             prompt_prefix=None,
-                             prompt_suffix=None,
-                             n_predict=None,
-                             cache_prompt=False,
-                             id_slot=None,
-                             expect_api_error=None,
-                             user_api_key=None,
-                             temperature=None) -> int | dict[str, Any]:
-    if debug:
-        print(f"Sending completion request: {prompt}")
-    origin = "my.super.domain"
-    headers = {
-        'Origin': origin
-    }
-    if user_api_key is not None:
-        if debug:
-            print(f"Set user_api_key: {user_api_key}")
-        headers['Authorization'] = f'Bearer {user_api_key}'
-
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{base_url}/completion',
-                                json={
-                                    "input_prefix": prompt_prefix,
-                                    "prompt": prompt,
-                                    "input_suffix": prompt_suffix,
-                                    "n_predict": n_predict if n_predict is not None else -1,
-                                    "cache_prompt": cache_prompt,
-                                    "id_slot": id_slot,
-                                    "seed": seed if seed is not None else 42,
-                                    "temperature": temperature if temperature is not None else 0.8,
-                                    "n_probs": 2,
-                                },
-                                headers=headers) as response:
-            if expect_api_error is None or not expect_api_error:
-                assert response.status == 200
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                return await response.json()
-            else:
-                return response.status
-
-
-async def oai_chat_completions(user_prompt,
-                               seed,
-                               system_prompt,
-                               base_url: str,
-                               base_path: str,
-                               async_client,
-                               debug=False,
-                               temperature=None,
-                               model=None,
-                               n_predict=None,
-                               enable_streaming=None,
-                               response_format=None,
-                               user_api_key=None,
-                               expect_api_error=None) -> int | dict[str, Any]:
-    if debug:
-        print(f"Sending OAI Chat completions request: {user_prompt}")
-    # openai client always expects an api key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    seed = seed if seed is not None else 42
-    enable_streaming = enable_streaming if enable_streaming is not None else False
-    payload = {
-        "messages": [
-            {
-                "role": "system",
-                "content": system_prompt,
-            },
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        ],
-        "model": model,
-        "max_tokens": n_predict,
-        "stream": enable_streaming,
-        "temperature": temperature if temperature is not None else 0.0,
-        "seed": seed,
-    }
-    if response_format is not None:
-        payload['response_format'] = response_format
-    completion_response = {
-        'content': '',
-        'timings': {
-            'predicted_n': 0,
-            'prompt_n': 0
-        }
-    }
-    if async_client:
-        origin = 'llama.cpp'
-        headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-            async with session.post(f'{base_url}{base_path}',
-                                    json=payload,
-                                    headers=headers) as response:
-                if enable_streaming:
-                    assert response.status == 200
-                    assert response.headers['Access-Control-Allow-Origin'] == origin
-                    assert response.headers['Content-Type'] == "text/event-stream"
-                    event_received = True
-                    while event_received:
-                        event_received = False
-                        async for line_in_bytes in response.content:
-                            line = line_in_bytes.decode('utf-8')
-                            line = line.rstrip('\n').rstrip('\r')
-                            if line == '':
-                                continue
-                            event_data = line.split(': ', 1)
-                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
-                            chunk_raw = event_data[1]
-                            if chunk_raw == '[DONE]':
-                                break
-
-                            chunk = json.loads(chunk_raw)
-                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
-                            delta = chunk['choices'][0]['delta']
-                            if 'content' in delta:
-                                completion_response['content'] += delta['content']
-                                completion_response['timings']['predicted_n'] += 1
-                else:
-                    if expect_api_error is None or not expect_api_error:
-                        assert response.status == 200
-                        assert response.headers['Access-Control-Allow-Origin'] == origin
-                        assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                        chat_completion_raw = await response.json()
-                        completion_response = {
-                            'content': chat_completion_raw['choices'][0]['message'],
-                            'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens'],
-                                'prompt_n': chat_completion_raw['usage']['prompt_tokens']
-                            }
-                        }
-                    else:
-                        return response.status
-    else:
-        try:
-            openai.api_key = user_api_key
-            openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
-            assert model is not None
-            chat_completion = openai.chat.completions.create(
-                messages=payload['messages'],
-                model=model,
-                max_tokens=n_predict,
-                stream=enable_streaming,
-                response_format=payload.get('response_format') or openai.NOT_GIVEN,
-                seed=seed,
-                temperature=payload['temperature']
-            )
-        except openai.AuthenticationError as e:
-            if expect_api_error is not None and expect_api_error:
-                return 401
-            else:
-                assert False, f'error raised: {e}'
-
-        if enable_streaming:
-            chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
-            for chunk in chat_completion:
-                assert len(chunk.choices) == 1
-                delta = chunk.choices[0].delta
-                if delta.content is not None:
-                    completion_response['content'] += delta.content
-                    completion_response['timings']['predicted_n'] += 1
-                completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
-        else:
-            assert len(chat_completion.choices) == 1
-            assert chat_completion.usage is not None
-            completion_response = {
-                'content': chat_completion.choices[0].message.content,
-                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens,
-                    'prompt_n': chat_completion.usage.prompt_tokens
-                    },
-                'truncated': chat_completion.choices[0].finish_reason != 'stop'
-            }
-    if debug:
-        print("OAI response formatted to llama.cpp:", completion_response)
-    return completion_response
-
-
-async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{base_url}/embedding',
-                                json={
-                                    "content": content,
-                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                return [response_json['embedding']]
-            else:
-                return response.status
-
-
-async def request_oai_embeddings(input, seed,
-                                 base_url=None, user_api_key=None,
-                                 model=None, async_client=False) -> list[list[float]]:
-    # openai client always expects an api_key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    if async_client:
-        origin = 'llama.cpp'
-        headers=[]
-        if user_api_key is not None:
-            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-            async with session.post(f'{base_url}/v1/embeddings',
-                                    json={
-                                        "input": input,
-                                        "model": model,
-                                    },
-                                    headers=headers) as response:
-                assert response.status == 200, f"received status code not expected: {response.status}"
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                response_json = await response.json()
-                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
-                assert response_json['object'] == 'list'
-                if isinstance(input, Sequence):
-                    embeddings = []
-                    for an_oai_embeddings in response_json['data']:
-                        embeddings.append(an_oai_embeddings['embedding'])
-                else:
-                    embeddings = [response_json['data']['embedding']]
-                return embeddings
-    else:
-        openai.api_key = user_api_key
-        openai.base_url = f'{base_url}/v1/'
-        assert model is not None
-        oai_embeddings = openai.embeddings.create(
-            model=model,
-            input=input,
-        )
-
-        return [e.embedding for e in oai_embeddings.data]
-
-
-def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
-    content = completion_response['content']
-    n_predicted = completion_response['timings']['predicted_n']
-    assert len(content) > 0, "no token predicted"
-    if re_content is not None:
-        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
-        matches = p.finditer(content)
-        last_match = 0
-        highlighted = ''
-        for match in matches:
-            start, end = match.span()
-            highlighted += content[last_match: start]
-            highlighted += '\x1b[33m'
-            highlighted += content[start: end]
-            highlighted += '\x1b[0m'
-            last_match = end
-        highlighted += content[last_match:]
-        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-          print(f"Checking completion response: {highlighted}")
-        assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
-    if expected_predicted_n and expected_predicted_n > 0:
-        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
-                                                     f' {n_predicted} <> {expected_predicted_n}')
-
-def assert_all_predictions_equal(completion_responses):
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for i, response_i in enumerate(completion_responses):
-            content_i = response_i['content']
-            print(f"content {i}: {content_i}")
-    for i, response_i in enumerate(completion_responses):
-        content_i = response_i['content']
-        for j, response_j in enumerate(completion_responses):
-            if i == j:
-                continue
-            content_j = response_j['content']
-            assert content_i == content_j, "contents not equal"
-
-
-def assert_all_predictions_different(completion_responses):
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for i, response_i in enumerate(completion_responses):
-            content_i = response_i['content']
-            print(f"content {i}: {content_i}")
-    for i, response_i in enumerate(completion_responses):
-        content_i = response_i['content']
-        for j, response_j in enumerate(completion_responses):
-            if i == j:
-                continue
-            content_j = response_j['content']
-            assert content_i != content_j, "contents not different"
-
-
-def assert_all_token_probabilities_equal(completion_responses):
-    n_predict = len(completion_responses[0]['completion_probabilities'])
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for pos in range(n_predict):
-            for i, response_i in enumerate(completion_responses):
-                probs_i = response_i['completion_probabilities'][pos]['probs']
-                print(f"pos {pos}, probs {i}: {probs_i}")
-    for pos in range(n_predict):
-        for i, response_i in enumerate(completion_responses):
-            probs_i = response_i['completion_probabilities'][pos]['probs']
-            for j, response_j in enumerate(completion_responses):
-                if i == j:
-                    continue
-                probs_j = response_j['completion_probabilities'][pos]['probs']
-                assert probs_i == probs_j, "contents not equal"
-
-
-async def gather_tasks_results(context):
-    n_tasks = len(context.concurrent_tasks)
-    if context.debug:
-        print(f"Waiting for all {n_tasks} tasks results...")
-    for task_no in range(n_tasks):
-        context.tasks_result.append(await context.concurrent_tasks.pop())
-    n_completions = len(context.tasks_result)
-    return n_completions
-
-
-async def wait_for_slots_status(context,
-                                base_url,
-                                expected_http_status_code,
-                                timeout=3,
-                                params=None,
-                                slots_idle=None,
-                                slots_processing=None):
-    if context.debug:
-        print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
-    interval = 0.5
-    counter = 0
-    if 'GITHUB_ACTIONS' in os.environ:
-        timeout *= 2
-
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        while True:
-            headers = {'Authorization': f'Bearer {context.server_api_key}'}
-            async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
-                status_code = slots_response.status
-                slots = await slots_response.json()
-                if context.debug:
-                    print(f"slots responses {slots}\n")
-                if status_code == 503 and status_code == expected_http_status_code:
-                    return
-                if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
-                    if ((slots_idle is None or slots_idle == n_slots_idle)
-                        and (slots_processing is None or slots_processing == n_slots_processing)):
-                        return
-            await asyncio.sleep(interval)
-
-            counter += interval
-            if counter >= timeout:
-                # Sometimes health requests are triggered after completions are predicted
-                if expected_http_status_code == 503:
-                    if len(context.tasks_result) == 0:
-                        print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
-                              " busy health check missed, probably too fast inference\x1b[0m\n")
-                        n_completions = await gather_tasks_results(context)
-                        if n_completions > 0:
-                            return
-
-                assert False, f'slots check timeout exceeded {counter}s>={timeout}'
-
-
-def assert_embeddings(embeddings):
-    assert len(embeddings) > 0
-    embeddings_computed = False
-    for emb in embeddings:
-        if not isinstance(emb, float):
-            assert False, f"Bad embeddings: {embeddings}"
-        if emb != 0:
-            embeddings_computed = True
-    assert embeddings_computed, f"Embeddings: {embeddings}"
-
-
-async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with await session.get(f'{context.base_url}/slots') as slots_response:
-            assert slots_response.status == 200
-            slots = await slots_response.json()
-            assert_slots_status(slots, expected_slots)
-
-
-def assert_slots_status(slots, expected_slots):
-    assert len(slots) == len(expected_slots)
-    for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
-        for key in expected:
-            assert expected[key] == slot[key], (f"invalid slot {slot_id}"
-                                                f" expected[{key}] != slot[{key}]"
-                                                f" = {expected[key]} != {slot[key]}")
-
-
-async def completions_seed(context, num_seeds=None):
-    if hasattr(context, "seed") and context.seed is not None:
-        assert len(context.seed) == context.n_prompts
-        if num_seeds is None:
-            num_seeds = context.n_prompts
-        assert num_seeds <= context.n_prompts
-        seeds = context.seed[:num_seeds]
-        context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
-        return seeds
-
-    if hasattr(context, "server_seed") and context.server_seed is not None:
-        if num_seeds is None:
-            return [context.server_seed] * context.n_prompts
-        else:
-            return [context.server_seed] * num_seeds
-    return None
-
-
-def context_text(context):
-    return context.text.replace('\r', '')
-
-
-def start_server_background(context):
-    if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/llama-server.exe'
-    else:
-        context.server_path = '../../../build/bin/llama-server'
-    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
-        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
-    server_listen_addr = context.server_fqdn
-    server_args = [
-        '--slots', # requires to get slot status via /slots endpoint
-        '--host', server_listen_addr,
-        '--port', context.server_port,
-    ]
-    if context.model_file:
-        server_args.extend(['--model', context.model_file])
-    if context.model_url:
-        server_args.extend(['--model-url', context.model_url])
-    if context.model_hf_repo:
-        server_args.extend(['--hf-repo', context.model_hf_repo])
-    if context.model_hf_file:
-        server_args.extend(['--hf-file', context.model_hf_file])
-    if context.n_batch:
-        server_args.extend(['--batch-size', context.n_batch])
-    if context.n_ubatch:
-        server_args.extend(['--ubatch-size', context.n_ubatch])
-    if context.n_threads:
-        server_args.extend(['--threads', context.threads])
-    if context.n_gpu_layer:
-        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
-    if context.draft is not None:
-        server_args.extend(['--draft', context.draft])
-    if context.server_continuous_batching:
-        server_args.append('--cont-batching')
-    if context.server_embeddings:
-        server_args.append('--embedding')
-    if context.server_reranking:
-        server_args.append('--reranking')
-    if context.server_metrics:
-        server_args.append('--metrics')
-    if context.model_alias:
-        server_args.extend(['--alias', context.model_alias])
-    if context.n_ctx:
-        server_args.extend(['--ctx-size', context.n_ctx])
-    if context.n_slots:
-        server_args.extend(['--parallel', context.n_slots])
-    if context.n_server_predict:
-        server_args.extend(['--n-predict', context.n_server_predict])
-    if context.slot_save_path:
-        server_args.extend(['--slot-save-path', context.slot_save_path])
-    if context.server_api_key:
-        server_args.extend(['--api-key', context.server_api_key])
-    if context.n_ga:
-        server_args.extend(['--grp-attn-n', context.n_ga])
-    if context.n_ga_w:
-        server_args.extend(['--grp-attn-w', context.n_ga_w])
-    if context.debug:
-        server_args.append('--verbose')
-    if context.lora_file:
-        server_args.extend(['--lora', context.lora_file])
-    if context.disable_ctx_shift:
-        server_args.extend(['--no-context-shift'])
-
-    args = [str(arg) for arg in [context.server_path, *server_args]]
-    print(f"bench: starting server with: {' '.join(args)}")
-
-    flags = 0
-    if 'nt' == os.name:
-        flags |= subprocess.DETACHED_PROCESS
-        flags |= subprocess.CREATE_NEW_PROCESS_GROUP
-        flags |= subprocess.CREATE_NO_WINDOW
-
-    pkwargs = {
-        'creationflags': flags,
-        'stdout': subprocess.PIPE,
-        'stderr': subprocess.PIPE
-    }
-    context.server_process = subprocess.Popen(
-        [str(arg) for arg in [context.server_path, *server_args]],
-        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue]
-
-    def server_log(in_stream, out_stream):
-        for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='', file=out_stream)
-
-    thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
-    thread_stdout.start()
-
-    thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
-    thread_stderr.start()
-
-    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
deleted file mode 100644
index 61d5f315e..000000000
--- a/examples/server/tests/features/wrong_usages.feature
+++ /dev/null
@@ -1,25 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags wrong_usage
-@wrong_usage
-Feature: Wrong usage of llama.cpp server
-
-  #3969 The user must always set --n-predict option
-  # to cap the number of tokens any completion request can generate
-  # or pass n_predict/max_tokens in the request.
-  Scenario: Infinite loop
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   42 as server seed
-    And   2048 KV cache size
-    # Uncomment below to fix the issue
-    #And   64 server max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-      """
-      Go to: infinite loop
-      """
-    # Uncomment below to fix the issue
-    #And   128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is idle
-    Then all prompts are predicted
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 553954872..935a79114 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,5 +1,5 @@
 aiohttp~=3.9.3
-behave~=1.2.6
+pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
 openai~=1.30.3
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 72a0fbad8..1e285dcda 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -4,8 +4,7 @@ set -eu
 
 if [ $# -lt 1 ]
 then
-    # Start @llama.cpp scenario
-    behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+    pytest -v -x
 else
-    behave "$@"
+    pytest "$@"
 fi
diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py
new file mode 100644
index 000000000..84db5ca1c
--- /dev/null
+++ b/examples/server/tests/unit/test_basic.py
@@ -0,0 +1,34 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_start_simple():
+    global server
+    server.start()
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+
+
+def test_server_props():
+    global server
+    server.start()
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["total_slots"] == server.n_slots
+
+
+def test_server_models():
+    global server
+    server.start()
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    assert len(res.body["data"]) == 1
+    assert res.body["data"][0]["id"] == server.model_alias
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
new file mode 100644
index 000000000..d7aeb288d
--- /dev/null
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -0,0 +1,129 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+@pytest.mark.parametrize(
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
+    [
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
+    ]
+)
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "model": model,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body["usage"]["prompt_tokens"] == n_prompt
+    assert res.body["usage"]["completion_tokens"] == n_predicted
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert match_regex(re_content, choice["message"]["content"])
+    if truncated:
+        assert choice["finish_reason"] == "length"
+    else:
+        assert choice["finish_reason"] == "stop"
+
+
+@pytest.mark.parametrize(
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
+    [
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
+    ]
+)
+def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "model": model,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        choice = data["choices"][0]
+        if choice["finish_reason"] in ["stop", "length"]:
+            assert data["usage"]["prompt_tokens"] == n_prompt
+            assert data["usage"]["completion_tokens"] == n_predicted
+            assert "content" not in choice["delta"]
+            assert match_regex(re_content, content)
+            # FIXME: not sure why this is incorrect in stream mode
+            # if truncated:
+            #   assert choice["finish_reason"] == "length"
+            # else:
+            #   assert choice["finish_reason"] == "stop"
+        else:
+            assert choice["finish_reason"] is None
+            content += choice["delta"]["content"]
+
+
+def test_chat_completion_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=8,
+        seed=42,
+        temperature=0.8,
+    )
+    print(res)
+    assert res.choices[0].finish_reason == "stop"
+    assert res.choices[0].message.content is not None
+    assert match_regex("(Suddenly)+", res.choices[0].message.content)
+
+
+@pytest.mark.parametrize("response_format,n_predicted,re_content", [
+    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
+    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
+    ({"type": "json_object"}, 10, "(\\{|John)+"),
+    ({"type": "sound"}, 0, None),
+    # invalid response format (expected to fail)
+    ({"type": "json_object", "schema": 123}, 0, None),
+    ({"type": "json_object", "schema": {"type": 123}}, 0, None),
+    ({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
+])
+def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predicted,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "Write an example"},
+        ],
+        "response_format": response_format,
+    })
+    if re_content is not None:
+        assert res.status_code == 200
+        choice = res.body["choices"][0]
+        assert match_regex(re_content, choice["message"]["content"])
+    else:
+        assert res.status_code != 200
+        assert "error" in res.body
+
diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
new file mode 100644
index 000000000..2fa30dd03
--- /dev/null
+++ b/examples/server/tests/unit/test_completion.py
@@ -0,0 +1,223 @@
+import pytest
+import time
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+])
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == n_prompt
+    assert res.body["timings"]["predicted_n"] == n_predicted
+    assert res.body["truncated"] == truncated
+    assert match_regex(re_content, res.body["content"])
+
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+])
+def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        if data["stop"]:
+            assert data["timings"]["prompt_n"] == n_prompt
+            assert data["timings"]["predicted_n"] == n_predicted
+            assert data["truncated"] == truncated
+            assert match_regex(re_content, content)
+        else:
+            content += data["content"]
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_consistent_result_same_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": 1.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_different_result_different_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for seed in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": seed,
+            "temperature": 1.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] != last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.parametrize("n_batch", [16, 32])
+@pytest.mark.parametrize("temperature", [0.0, 1.0])
+def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
+    global server
+    server.n_batch = n_batch
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": temperature,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
+def test_cache_vs_nocache_prompt():
+    global server
+    server.start()
+    res_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": True,
+    })
+    res_no_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": False,
+    })
+    assert res_cache.body["content"] == res_no_cache.body["content"]
+
+
+def test_completion_with_tokens_input():
+    global server
+    server.temperature = 0.0
+    server.start()
+    prompt_str = "I believe the meaning of life is"
+    res = server.make_request("POST", "/tokenize", data={
+        "content": prompt_str,
+        "add_special": True,
+    })
+    assert res.status_code == 200
+    tokens = res.body["tokens"]
+
+    # single completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": tokens,
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+    # batch completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, tokens],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens in one sequence
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 3),
+    (2, 2),
+    (2, 4),
+    (4, 2), # some slots must be idle
+    (4, 6),
+])
+def test_completion_parallel_slots(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.temperature = 0.0
+    server.start()
+
+    PROMPTS = [
+        ("Write a very long book.", "(very|special|big)+"),
+        ("Write another a poem.", "(small|house)+"),
+        ("What is LLM?", "(Dad|said)+"),
+        ("The sky is blue and I love it.", "(climb|leaf)+"),
+        ("Write another very long music lyrics.", "(friends|step|sky)+"),
+        ("Write a very long joke.", "(cat|Whiskers)+"),
+    ]
+    def check_slots_status():
+        should_all_slots_busy = n_requests >= n_slots
+        time.sleep(0.1)
+        res = server.make_request("GET", "/slots")
+        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
+        if should_all_slots_busy:
+            assert n_busy == n_slots
+        else:
+            assert n_busy <= n_slots
+
+    tasks = []
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": prompt,
+            "seed": 42,
+            "temperature": 1.0,
+        })))
+    tasks.append((check_slots_status, ()))
+    results = parallel_function_calls(tasks)
+
+    # check results
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        res = results[i]
+        assert res.status_code == 200
+        assert type(res.body["content"]) == str
+        assert len(res.body["content"]) > 10
+        # FIXME: the result is not deterministic when using other slot than slot 0
+        # assert match_regex(re_content, res.body["content"])
diff --git a/examples/server/tests/unit/test_ctx_shift.py b/examples/server/tests/unit/test_ctx_shift.py
new file mode 100644
index 000000000..be93a6d31
--- /dev/null
+++ b/examples/server/tests/unit/test_ctx_shift.py
@@ -0,0 +1,67 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+LONG_TEXT = """
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+""".strip()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.n_ctx = 256
+    server.n_slots = 2
+
+
+def test_ctx_shift_enabled():
+    # the prompt is 301 tokens
+    # the slot context is 256/2 = 128 tokens
+    # the prompt is truncated to keep the last 109 tokens
+    # 64 tokens are generated thanks to shifting the context when it gets full
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == 109
+    assert res.body["timings"]["predicted_n"] == 64
+    assert res.body["truncated"] is True
+
+
+@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
+    (64, 64, False),
+    (-1, 120, True),
+])
+def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
+    global server
+    server.disable_ctx_shift = True
+    server.n_predict = -1
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": "Hi how are you",
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["predicted_n"] == n_token_output
+    assert res.body["truncated"] == truncated
+
+
+def test_ctx_shift_disabled_long_prompt():
+    global server
+    server.disable_ctx_shift = True
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code != 200
+    assert "error" in res.body
+    assert "exceeds the available context size" in res.body["error"]["message"]
diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
new file mode 100644
index 000000000..fc7c20064
--- /dev/null
+++ b/examples/server/tests/unit/test_embedding.py
@@ -0,0 +1,99 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.bert_bge_small()
+
+EPSILON = 1e-3
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.bert_bge_small()
+
+
+def test_embedding_single():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "I believe the meaning of life is",
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 1
+    assert 'embedding' in res.body['data'][0]
+    assert len(res.body['data'][0]['embedding']) > 1
+
+    # make sure embedding vector is normalized
+    assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
+
+
+def test_embedding_multiple():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "Write a joke about AI from a very long prompt which will not be truncated",
+            "This is a test",
+            "This is another test",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
+def test_embedding_openai_library_single():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
+    assert len(res.data) == 1
+    assert len(res.data[0].embedding) > 1
+
+
+def test_embedding_openai_library_multiple():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.embeddings.create(model="text-embedding-3-small", input=[
+        "I believe the meaning of life is",
+        "Write a joke about AI from a very long prompt which will not be truncated",
+        "This is a test",
+        "This is another test",
+    ])
+    assert len(res.data) == 4
+    for d in res.data:
+        assert len(d.embedding) > 1
+
+
+def test_embedding_error_prompt_too_long():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "This is a test " * 512,
+    })
+    assert res.status_code != 200
+    assert "too large" in res.body["error"]["message"]
+
+
+def test_same_prompt_give_same_result():
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 5
+    for i in range(1, len(res.body['data'])):
+        v0 = res.body['data'][0]['embedding']
+        vi = res.body['data'][i]['embedding']
+        for x, y in zip(v0, vi):
+            assert abs(x - y) < EPSILON
diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
new file mode 100644
index 000000000..38ce6c429
--- /dev/null
+++ b/examples/server/tests/unit/test_infill.py
@@ -0,0 +1,35 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama_infill()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama_infill()
+
+def test_infill_without_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
+
+def test_infill_with_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_extra": [{
+            "filename": "llama.h",
+            "text": "LLAMA_API int32_t llama_n_threads();\n"
+        }],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py
new file mode 100644
index 000000000..749615449
--- /dev/null
+++ b/examples/server/tests/unit/test_lora.py
@@ -0,0 +1,42 @@
+import pytest
+import os
+from utils import *
+
+server = ServerPreset.stories15m_moe()
+
+LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    # download lora file if needed
+    file_name = LORA_FILE_URL.split('/').pop()
+    lora_file = f'../../../{file_name}'
+    if not os.path.exists(lora_file):
+        print(f"Downloading {LORA_FILE_URL} to {lora_file}")
+        with open(lora_file, 'wb') as f:
+            f.write(requests.get(LORA_FILE_URL).content)
+        print(f"Done downloading lora file")
+    server.lora_files = [lora_file]
+
+
+@pytest.mark.parametrize("scale,re_content", [
+    # without applying lora, the model should behave like a bedtime story generator
+    (0.0, "(little|girl|three|years|old)+"),
+    # with lora, the model should behave like a Shakespearean text generator
+    (1.0, "(eye|love|glass|sun)+"),
+])
+def test_lora(scale: float, re_content: str):
+    global server
+    server.start()
+    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
+        {"id": 0, "scale": scale}
+    ])
+    assert res_lora_control.status_code == 200
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "Look in thy glass",
+    })
+    assert res.status_code == 200
+    assert match_regex(re_content, res.body["content"])
+
diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py
new file mode 100644
index 000000000..3a49fd3ac
--- /dev/null
+++ b/examples/server/tests/unit/test_rerank.py
@@ -0,0 +1,38 @@
+import pytest
+from utils import *
+
+server = ServerPreset.jina_reranker_tiny()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.jina_reranker_tiny()
+
+
+def test_rerank():
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": [
+            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
+            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
+            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
+            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
+        ]
+    })
+    assert res.status_code == 200
+    assert len(res.body["results"]) == 4
+
+    most_relevant = res.body["results"][0]
+    least_relevant = res.body["results"][0]
+    for doc in res.body["results"]:
+        if doc["relevance_score"] > most_relevant["relevance_score"]:
+            most_relevant = doc
+        if doc["relevance_score"] < least_relevant["relevance_score"]:
+            least_relevant = doc
+
+    assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
+    assert most_relevant["index"] == 2
+    assert least_relevant["index"] == 3
diff --git a/examples/server/tests/unit/test_security.py b/examples/server/tests/unit/test_security.py
new file mode 100644
index 000000000..620b25376
--- /dev/null
+++ b/examples/server/tests/unit/test_security.py
@@ -0,0 +1,83 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+TEST_API_KEY = "sk-this-is-the-secret-key"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.api_key = TEST_API_KEY
+
+
+@pytest.mark.parametrize("endpoint", ["/health", "/models"])
+def test_access_public_endpoint(endpoint: str):
+    global server
+    server.start()
+    res = server.make_request("GET", endpoint)
+    assert res.status_code == 200
+    assert "error" not in res.body
+
+
+@pytest.mark.parametrize("api_key", [None, "invalid-key"])
+def test_incorrect_api_key(api_key: str):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {api_key}" if api_key else None,
+    })
+    assert res.status_code == 401
+    assert "error" in res.body
+    assert res.body["error"]["type"] == "authentication_error"
+
+
+def test_correct_api_key():
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {TEST_API_KEY}",
+    })
+    assert res.status_code == 200
+    assert "error" not in res.body
+    assert "content" in res.body
+
+
+def test_openai_library_correct_api_key():
+    global server
+    server.start()
+    client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a chatbot."},
+            {"role": "user", "content": "What is the meaning of life?"},
+        ],
+    )
+    assert len(res.choices) == 1
+
+
+@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
+    ("localhost", "Access-Control-Allow-Origin", "localhost"),
+    ("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
+    ("origin", "Access-Control-Allow-Credentials", "true"),
+    ("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
+    ("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
+])
+def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
+    global server
+    server.start()
+    res = server.make_request("OPTIONS", "/completions", headers={
+        "Origin": origin,
+        "Access-Control-Request-Method": "POST",
+        "Access-Control-Request-Headers": "Authorization",
+    })
+    assert res.status_code == 200
+    assert cors_header in res.headers
+    assert res.headers[cors_header] == cors_header_value
diff --git a/examples/server/tests/unit/test_slot_save.py b/examples/server/tests/unit/test_slot_save.py
new file mode 100644
index 000000000..38704f5ec
--- /dev/null
+++ b/examples/server/tests/unit/test_slot_save.py
@@ -0,0 +1,98 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.slot_save_path = "./tmp"
+    server.temperature = 0.0
+
+
+def test_slot_save_restore():
+    global server
+    server.start()
+
+    # First prompt in slot 1 should be fully processed
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # Save state of slot 1
+    res = server.make_request("POST", "/slots/1?action=save", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_saved"] == 84
+
+    # Since we have cache, this should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # Loading the saved cache into slot 0
+    res = server.make_request("POST", "/slots/0?action=restore", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_restored"] == 84
+
+    # Since we have cache, slot 0 should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 0,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing should work
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 1
+
+
+def test_slot_erase():
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # erase slot 1
+    res = server.make_request("POST", "/slots/1?action=erase")
+    assert res.status_code == 200
+
+    # re-run the same prompt, it should process all tokens again
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
diff --git a/examples/server/tests/unit/test_tokenize.py b/examples/server/tests/unit/test_tokenize.py
new file mode 100644
index 000000000..382457c9d
--- /dev/null
+++ b/examples/server/tests/unit/test_tokenize.py
@@ -0,0 +1,59 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_tokenize_detokenize():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content
+    })
+    assert res_tok.status_code == 200
+    assert len(res_tok.body["tokens"]) > 5
+    # detokenize
+    res_detok = server.make_request("POST", "/detokenize", data={
+        "tokens": res_tok.body["tokens"],
+    })
+    assert res_detok.status_code == 200
+    assert res_detok.body["content"].strip() == content
+
+
+def test_tokenize_with_bos():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    bosId = 1
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "add_special": True,
+    })
+    assert res_tok.status_code == 200
+    assert res_tok.body["tokens"][0] == bosId
+
+
+def test_tokenize_with_pieces():
+    global server
+    server.start()
+    # tokenize
+    content = "This is a test string with unicode 媽 and emoji 🤗"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "with_pieces": True,
+    })
+    assert res_tok.status_code == 200
+    for token in res_tok.body["tokens"]:
+        assert "id" in token
+        assert token["id"] > 0
+        assert "piece" in token
+        assert len(token["piece"]) > 0
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
new file mode 100644
index 000000000..bc590bcb3
--- /dev/null
+++ b/examples/server/tests/utils.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# type: ignore[reportUnusedImport]
+
+import subprocess
+import os
+import re
+import json
+import sys
+import threading
+import requests
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Tuple,
+    Set,
+)
+from re import RegexFlag
+
+
+class ServerResponse:
+    headers: dict
+    status_code: int
+    body: dict | Any
+
+
+class ServerProcess:
+    # default options
+    debug: bool = False
+    server_port: int = 8080
+    server_host: str = "127.0.0.1"
+    model_hf_repo: str = "ggml-org/models"
+    model_hf_file: str = "tinyllamas/stories260K.gguf"
+    model_alias: str = "tinyllama-2"
+    temperature: float = 0.8
+    seed: int = 42
+
+    # custom options
+    model_alias: str | None = None
+    model_url: str | None = None
+    model_file: str | None = None
+    n_threads: int | None = None
+    n_gpu_layer: int | None = None
+    n_batch: int | None = None
+    n_ubatch: int | None = None
+    n_ctx: int | None = None
+    n_ga: int | None = None
+    n_ga_w: int | None = None
+    n_predict: int | None = None
+    n_prompts: int | None = 0
+    slot_save_path: str | None = None
+    id_slot: int | None = None
+    cache_prompt: bool | None = None
+    n_slots: int | None = None
+    server_continuous_batching: bool | None = False
+    server_embeddings: bool | None = False
+    server_reranking: bool | None = False
+    server_metrics: bool | None = False
+    draft: int | None = None
+    api_key: str | None = None
+    response_format: str | None = None
+    lora_files: List[str] | None = None
+    disable_ctx_shift: int | None = False
+
+    # session variables
+    process: subprocess.Popen | None = None
+
+    def __init__(self):
+        if "N_GPU_LAYERS" in os.environ:
+            self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
+        if "DEBUG" in os.environ:
+            self.debug = True
+        if "PORT" in os.environ:
+            self.server_port = int(os.environ["PORT"])
+
+    def start(self, timeout_seconds: int = 10) -> None:
+        if "LLAMA_SERVER_BIN_PATH" in os.environ:
+            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
+        elif os.name == "nt":
+            server_path = "../../../build/bin/Release/llama-server.exe"
+        else:
+            server_path = "../../../build/bin/llama-server"
+        server_args = [
+            "--slots",  # requires to get slot status via /slots endpoint
+            "--host",
+            self.server_host,
+            "--port",
+            self.server_port,
+            "--temp",
+            self.temperature,
+            "--seed",
+            self.seed,
+        ]
+        if self.model_file:
+            server_args.extend(["--model", self.model_file])
+        if self.model_url:
+            server_args.extend(["--model-url", self.model_url])
+        if self.model_hf_repo:
+            server_args.extend(["--hf-repo", self.model_hf_repo])
+        if self.model_hf_file:
+            server_args.extend(["--hf-file", self.model_hf_file])
+        if self.n_batch:
+            server_args.extend(["--batch-size", self.n_batch])
+        if self.n_ubatch:
+            server_args.extend(["--ubatch-size", self.n_ubatch])
+        if self.n_threads:
+            server_args.extend(["--threads", self.n_threads])
+        if self.n_gpu_layer:
+            server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
+        if self.draft is not None:
+            server_args.extend(["--draft", self.draft])
+        if self.server_continuous_batching:
+            server_args.append("--cont-batching")
+        if self.server_embeddings:
+            server_args.append("--embedding")
+        if self.server_reranking:
+            server_args.append("--reranking")
+        if self.server_metrics:
+            server_args.append("--metrics")
+        if self.model_alias:
+            server_args.extend(["--alias", self.model_alias])
+        if self.n_ctx:
+            server_args.extend(["--ctx-size", self.n_ctx])
+        if self.n_slots:
+            server_args.extend(["--parallel", self.n_slots])
+        if self.n_predict:
+            server_args.extend(["--n-predict", self.n_predict])
+        if self.slot_save_path:
+            server_args.extend(["--slot-save-path", self.slot_save_path])
+        if self.n_ga:
+            server_args.extend(["--grp-attn-n", self.n_ga])
+        if self.n_ga_w:
+            server_args.extend(["--grp-attn-w", self.n_ga_w])
+        if self.debug:
+            server_args.append("--verbose")
+        if self.lora_files:
+            for lora_file in self.lora_files:
+                server_args.extend(["--lora", lora_file])
+        if self.disable_ctx_shift:
+            server_args.extend(["--no-context-shift"])
+        if self.api_key:
+            server_args.extend(["--api-key", self.api_key])
+
+        args = [str(arg) for arg in [server_path, *server_args]]
+        print(f"bench: starting server with: {' '.join(args)}")
+
+        flags = 0
+        if "nt" == os.name:
+            flags |= subprocess.DETACHED_PROCESS
+            flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+            flags |= subprocess.CREATE_NO_WINDOW
+
+        self.process = subprocess.Popen(
+            [str(arg) for arg in [server_path, *server_args]],
+            creationflags=flags,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env={**os.environ, "LLAMA_CACHE": "tmp"},
+        )
+        server_instances.add(self)
+
+        def server_log(in_stream, out_stream):
+            for line in iter(in_stream.readline, b""):
+                print(line.decode("utf-8"), end="", file=out_stream)
+
+        thread_stdout = threading.Thread(
+            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
+        )
+        thread_stdout.start()
+
+        thread_stderr = threading.Thread(
+            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
+        )
+        thread_stderr.start()
+
+        print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
+
+        # wait for server to start
+        start_time = time.time()
+        while time.time() - start_time < timeout_seconds:
+            try:
+                response = self.make_request("GET", "/slots", headers={
+                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
+                })
+                if response.status_code == 200:
+                    self.ready = True
+                    return  # server is ready
+            except Exception as e:
+                pass
+            print(f"Waiting for server to start...")
+            time.sleep(0.5)
+        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
+
+    def stop(self) -> None:
+        server_instances.remove(self)
+        if self.process:
+            print(f"Stopping server with pid={self.process.pid}")
+            self.process.kill()
+            self.process = None
+
+    def make_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | Any | None = None,
+        headers: dict | None = None,
+    ) -> ServerResponse:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        parse_body = False
+        if method == "GET":
+            response = requests.get(url, headers=headers)
+            parse_body = True
+        elif method == "POST":
+            response = requests.post(url, headers=headers, json=data)
+            parse_body = True
+        elif method == "OPTIONS":
+            response = requests.options(url, headers=headers)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        result = ServerResponse()
+        result.headers = dict(response.headers)
+        result.status_code = response.status_code
+        result.body = response.json() if parse_body else None
+        print("Response from server", result.body)
+        return result
+
+    def make_stream_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | None = None,
+        headers: dict | None = None,
+    ) -> Iterator[dict]:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        if method == "POST":
+            response = requests.post(url, headers=headers, json=data, stream=True)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        for line_bytes in response.iter_lines():
+            line = line_bytes.decode("utf-8")
+            if '[DONE]' in line:
+                break
+            elif line.startswith('data: '):
+                data = json.loads(line[6:])
+                print("Partial response from server", data)
+                yield data
+
+
+server_instances: Set[ServerProcess] = set()
+
+
+class ServerPreset:
+    @staticmethod
+    def tinyllama2() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K.gguf"
+        server.model_alias = "tinyllama-2"
+        server.n_ctx = 256
+        server.n_batch = 32
+        server.n_slots = 2
+        server.n_predict = 64
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def bert_bge_small() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 512
+        server.n_batch = 128
+        server.n_ubatch = 128
+        server.n_slots = 2
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
+    @staticmethod
+    def tinyllama_infill() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
+        server.model_alias = "tinyllama-infill"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def stories15m_moe() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/stories15M_MOE"
+        server.model_hf_file = "stories15M_MOE-F16.gguf"
+        server.model_alias = "stories15m-moe"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def jina_reranker_tiny() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
+        server.model_alias = "jina-reranker"
+        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
+        server.n_ctx = 512
+        server.n_batch = 512
+        server.n_slots = 1
+        server.seed = 42
+        server.server_reranking = True
+        return server
+
+
+def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
+    """
+    Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS.
+
+    Example usage:
+
+    results = parallel_function_calls([
+        (func1, (arg1, arg2)),
+        (func2, (arg3, arg4)),
+    ])
+    """
+    results = [None] * len(function_list)
+    exceptions = []
+
+    def worker(index, func, args):
+        try:
+            result = func(*args)
+            results[index] = result
+        except Exception as e:
+            exceptions.append((index, str(e)))
+
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i, (func, args) in enumerate(function_list):
+            future = executor.submit(worker, i, func, args)
+            futures.append(future)
+
+        # Wait for all futures to complete
+        for future in as_completed(futures):
+            pass
+
+    # Check if there were any exceptions
+    if exceptions:
+        print("Exceptions occurred:")
+        for index, error in exceptions:
+            print(f"Function at index {index}: {error}")
+
+    return results
+
+
+def match_regex(regex: str, text: str) -> bool:
+    return (
+        re.compile(
+            regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
+        ).search(text)
+        is not None
+    )

From 904109ed0d97c9b656a5e8bf612925f739bb8166 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 26 Nov 2024 09:45:05 -0600
Subject: [PATCH 10/43] vulkan: fix group_norm (#10496)

Fix bad calculation of the end of the range. Add a backend test that
covers the bad case (taken from stable diffusion).

Fixes https://github.com/leejet/stable-diffusion.cpp/issues/439.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp                | 2 +-
 ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp | 2 +-
 tests/test-backend-ops.cpp                          | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 49527fdf4..da1cfd24e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7157,7 +7157,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
     } else if (tensor->op == GGML_OP_POOL_2D) {
-        enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
+        enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
         const int32_t k0 = tensor->op_params[1];
         const int32_t k1 = tensor->op_params[2];
         const int32_t s0 = tensor->op_params[3];
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
index 5ad9b28da..b6a0d5645 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
@@ -19,7 +19,7 @@ void main() {
 
     const uint tid = gl_LocalInvocationID.x;
     const uint start = gl_WorkGroupID.x * group_size + tid;
-    const uint end = start + group_size;
+    const uint end = (gl_WorkGroupID.x + 1) * group_size;
 
     tmp[tid] = 0.0f;
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6376b0e4c..da66ed856 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3796,7 +3796,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_upscale());
     test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
     test_cases.emplace_back(new test_upscale_ext());
-    test_cases.emplace_back(new test_group_norm());
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
     test_cases.emplace_back(new test_arange());

From 249cd93da3df9c8fa78869b0522526d1625aca91 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Wed, 27 Nov 2024 00:00:41 +0800
Subject: [PATCH 11/43] mtgpu: Add MUSA_DOCKER_ARCH in Dockerfiles && update
 cmake and make (#10516)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 .devops/full-musa.Dockerfile         |  9 ++++++++-
 .devops/llama-cli-musa.Dockerfile    |  9 ++++++++-
 .devops/llama-server-musa.Dockerfile |  9 ++++++++-
 Makefile                             |  9 +++++----
 ggml/src/ggml-musa/CMakeLists.txt    | 11 ++++++++++-
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
index 575e81b48..3193fea1e 100644
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 
@@ -19,7 +22,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
index 3372749be..e7c75af20 100644
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake
 
@@ -15,7 +18,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-cli -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
index eb67201c1..cebe51d42 100644
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
@@ -15,7 +18,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/Makefile b/Makefile
index cfc74c1dc..9a079a237 100644
--- a/Makefile
+++ b/Makefile
@@ -815,7 +815,7 @@ ifdef GGML_MUSA
 	else
 		MUSA_PATH ?= /opt/musa
 	endif
-	MTGPU_TARGETS ?= mp_21 mp_22
+	MUSA_ARCHITECTURES ?= 21;22
 
 	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
@@ -834,7 +834,8 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
 
-	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
+	MUSAFLAGS  = -x musa -mtgpu
+	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 
 ifdef GGML_CUDA_FORCE_MMQ
 	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -878,14 +879,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
 	ggml/src/ggml-backend-impl.h \
 	ggml/src/ggml-common.h \
 	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 
 ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-cuda/%.cu \
 	ggml/include/ggml.h \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 endif # GGML_MUSA
 
 ifdef GGML_METAL
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
index e1a69186e..415b2b2e0 100644
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -20,6 +20,11 @@ find_package(MUSAToolkit)
 if (MUSAToolkit_FOUND)
     message(STATUS "MUSA Toolkit found")
 
+    if (NOT DEFINED MUSA_ARCHITECTURES)
+        set(MUSA_ARCHITECTURES "21;22")
+    endif()
+    message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
+
     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
 
@@ -44,7 +49,11 @@ if (MUSAToolkit_FOUND)
 
     set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
     foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
+        set(COMPILE_FLAGS "-x musa -mtgpu")
+        foreach(ARCH ${MUSA_ARCHITECTURES})
+            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
+        endforeach()
+        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
     endforeach()
 
     ggml_add_backend_library(ggml-musa

From be0e350c8b69632b27d5fb41fa064fa256dd7fbf Mon Sep 17 00:00:00 2001
From: Tristan Druyen <tristan@vault81.mozmail.com>
Date: Tue, 26 Nov 2024 19:27:28 +0100
Subject: [PATCH 12/43] Fix HIP flag inconsistency & build docs (#10524)

* Fix inconsistency of HIP flags in cmake & make

* Fix docs regarding GGML_HIP
---
 Makefile      | 4 ++--
 docs/build.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 9a079a237..25214ec05 100644
--- a/Makefile
+++ b/Makefile
@@ -752,7 +752,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 
 endif # GGML_VULKAN
 
-ifdef GGML_HIPBLAS
+ifdef GGML_HIP
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH      ?= /usr
 		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -807,7 +807,7 @@ ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # GGML_HIPBLAS
+endif # GGML_HIP
 
 ifdef GGML_MUSA
 	ifeq ($(wildcard /opt/musa),)
diff --git a/docs/build.md b/docs/build.md
index 359952b30..72b810437 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -221,7 +221,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 - Using `make`:
   ```bash
-  make GGML_HIPBLAS=1
+  make GGML_HIP=1
   ```
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
@@ -249,7 +249,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 - Using `make` (example for target gfx1030, build with 16 CPU threads):
   ```bash
-  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+  make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
   ```
 
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):

From 30ec39832165627dd6ed98938df63adfc6e6a21a Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 21:01:47 +0100
Subject: [PATCH 13/43] llama : disable warnings for 3rd party sha1 dependency
 (#10527)

---
 examples/gguf-hash/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
index 633f45535..7a494ce32 100644
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -4,10 +4,17 @@ install(TARGETS ${TARGET} RUNTIME)
 
 # clibs dependencies
 include_directories(deps/)
+
 add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
 target_link_libraries(${TARGET} PRIVATE xxhash)
+
 add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
 target_link_libraries(${TARGET} PRIVATE sha1)
+if (NOT MSVC)
+    # disable warnings in 3rd party code
+    target_compile_options(sha1 PRIVATE -w)
+endif()
+
 add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
 target_link_libraries(${TARGET} PRIVATE sha256)
 

From 5a349f2809dc825960dfcfdf8f76b19cd0345be7 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 21:13:54 +0100
Subject: [PATCH 14/43] ci : remove nix workflows (#10526)

---
 .github/workflows/nix-ci-aarch64.yml    | 72 ----------------------
 .github/workflows/nix-ci.yml            | 81 -------------------------
 .github/workflows/nix-flake-update.yml  | 22 -------
 .github/workflows/nix-publish-flake.yml | 36 -----------
 4 files changed, 211 deletions(-)
 delete mode 100644 .github/workflows/nix-ci-aarch64.yml
 delete mode 100644 .github/workflows/nix-ci.yml
 delete mode 100644 .github/workflows/nix-flake-update.yml
 delete mode 100644 .github/workflows/nix-publish-flake.yml

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
deleted file mode 100644
index 0da6acdf1..000000000
--- a/.github/workflows/nix-ci-aarch64.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
deleted file mode 100644
index 3fe941576..000000000
--- a/.github/workflows/nix-ci.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
deleted file mode 100644
index 3a6a96e26..000000000
--- a/.github/workflows/nix-flake-update.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml
deleted file mode 100644
index 2c3c1ebda..000000000
--- a/.github/workflows/nix-publish-flake.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"

From de5097351caffb3deaea3393633609df49ef41d0 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Tue, 26 Nov 2024 12:55:29 -0800
Subject: [PATCH 15/43] Add OLMo 2 model in docs (#10530)

* Add link to OLMo 2 model in docs

* Change link to landing page
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5f7933c13..414c5b1c0 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMo 2](https://allenai.org/olmo)
 - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

From c9b00a70b080d5c0668608024afc3e0e2fed822f Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 22:12:10 +0100
Subject: [PATCH 16/43] ci : fix cuda releases (#10532)

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c6aecec6e..399641c4a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -728,7 +728,7 @@ jobs:
             cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
-    runs-on: windows-2019
+    runs-on: windows-latest
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -929,7 +929,7 @@ jobs:
 
   windows-2019-cmake-cuda:
     runs-on: windows-2019
-    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:

From 4a57d362e1948ada50af997a92c3cbff9711e78b Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:00:50 -0600
Subject: [PATCH 17/43] vulkan: optimize Q2_K and Q3_K mul_mat_vec (#10459)

---
 .../vulkan-shaders/mul_mat_vec_q2_k.comp      | 74 +++++++++++++------
 .../vulkan-shaders/mul_mat_vec_q3_k.comp      | 46 +++++++++---
 2 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
index e2625d32b..fcf02210e 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -1,4 +1,5 @@
 #version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #include "mul_mat_vec_base.comp"
 
@@ -32,38 +33,67 @@ void main() {
     const uint s_offset = 8*v_im;
     const uint y_offset = 128*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         const uint y_idx = i * QUANT_K + y_offset;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
+        f16vec2 d = data_a[ib0 + i].d;
+        const FLOAT_TYPE dall = d.x;
+        const FLOAT_TYPE dmin = d.y;
+
+        B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
+        B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
+        B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
+        B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
+        B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
+        B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
+        B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
+        B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
+
+        uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
+        uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
+
+        uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
+        uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
+        uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
+
+        uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
+        uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
+        uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
+        uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
+
+        uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
+        uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
+        uvec2 qs0 =  uvec2(unpack8(qs0_u16));
+        uvec2 qs16 = uvec2(unpack8(qs16_u16));
 
         FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
         FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
-            sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
+        [[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 0) & 3),
+                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
+                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 2) & 3),
+                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
+                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 4) & 3),
+                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
+                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 6) & 3),
+                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
+            sum2 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_hi4[0]),
+                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_hi4[1]),
+                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_hi4[2]),
+                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_hi4[3]),
+                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_hi4[0]),
+                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_hi4[1]),
+                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_hi4[2]),
+                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
         }
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
+        temp = fma(dall, sum1, fma(-dmin, sum2, temp));
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
index a28804533..723fadde0 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -1,4 +1,5 @@
 #version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #include "mul_mat_vec_base.comp"
 
@@ -33,7 +34,7 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 128*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     const uint s_shift = 4 * v_im;
 
@@ -42,21 +43,44 @@ void main() {
 
         const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
 
+        B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
+        B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
+        B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
+        B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
+        B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
+        B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
+        B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
+        B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
+
+        uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
+        uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
+        uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
+        uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
+        uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
+        uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
+        u8vec2 s0 = unpack8(s0_16);
+        u8vec2 s2 = unpack8(s2_16);
+        u8vec2 s4 = unpack8(s4_16);
+        u8vec2 s6 = unpack8(s6_16);
+        u8vec2 s8 = unpack8(s8_16);
+        u8vec2 s10 = unpack8(s10_16);
+
         FLOAT_TYPE sum = FLOAT_TYPE(0.0);
         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
+            sum = fma(FLOAT_TYPE(b0[l])   * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b32[l])  * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b64[l])  * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b96[l])  * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b16[l])  * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b48[l])  * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b80[l])  * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
         }
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
+        temp = fma(d, sum, temp);
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {

From 71a64989a5d2e25c13507efada145f12cf358914 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:08:54 -0600
Subject: [PATCH 18/43] vulkan: skip integer div/mod in get_offsets for
 batch_idx==0 (#10506)

---
 .../vulkan-shaders/mul_mat_vec_base.comp            | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
index 8d0a57913..2ec1af5c7 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -52,13 +52,16 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
 #endif
 
 #ifndef MUL_MAT_ID
-    const uint i13 = batch_idx / p.ne12;
-    const uint i12 = batch_idx % p.ne12;
+    uint batch_idx_a = 0;
+    if (batch_idx != 0) {
+        const uint i13 = batch_idx / p.ne12;
+        const uint i12 = batch_idx % p.ne12;
 
-    const uint i03 = i13 / p.broadcast3;
-    const uint i02 = i12 / p.broadcast2;
+        const uint i03 = i13 / p.broadcast3;
+        const uint i02 = i12 / p.broadcast2;
 
-    const uint batch_idx_a = i03 * p.ne02 + i02;
+        batch_idx_a = i03 * p.ne02 + i02;
+    }
 #else
     const uint expert_id = data_ids[expert_idx];
 #endif

From 249a7902ec710c8d027b9cc0ed10219d2b4184f8 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:21:59 -0600
Subject: [PATCH 19/43] vulkan: further optimize q5_k mul_mat_vec (#10479)

---
 .../vulkan-shaders/mul_mat_vec_q5_k.comp      | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
index 22a6bfae4..b455cbd31 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@@ -34,9 +34,6 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 64*v_im + l0;
 
-    const uint8_t hm1 = uint8_t(1 << (2*v_im));
-    const uint8_t hm2 = uint8_t(hm1 << 4);
-
     FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
@@ -71,6 +68,18 @@ void main() {
         uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
         uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
 
+        uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
+
+        uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
+        uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
+        uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
+        uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
+
+        qs0_16_u32_lo4 += qs0_16_lo4_offset16;
+        qs0_16_u32_hi4 += qs0_16_hi4_offset16;
+        qs64_80_u32_lo4 += qs64_80_lo4_offset16;
+        qs64_80_u32_hi4 += qs64_80_hi4_offset16;
+
         uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
         uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
         uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
@@ -102,31 +111,26 @@ void main() {
         B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
         B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
 
-        uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2];
-        uint32_t qh1 = qh0 >> 8;
-        uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8];
-        uint32_t qh17 = qh16 >> 8;
-
         const FLOAT_TYPE sx =
-          fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by10.x), q4_0,
+          fma(FLOAT_TYPE(by10.y), q4_1,
+          fma(FLOAT_TYPE(by116.x), q4_2,
+             FLOAT_TYPE(by116.y) * q4_3)));
         const FLOAT_TYPE sy =
-          fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by132.x), q4_4,
+          fma(FLOAT_TYPE(by132.y), q4_5,
+          fma(FLOAT_TYPE(by148.x), q4_6,
+             FLOAT_TYPE(by148.y) * q4_7)));
         const FLOAT_TYPE sz =
-          fma(FLOAT_TYPE(by20.x), (q4_8  + (((qh0 & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by20.y), (q4_9  + (((qh1 & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by20.x), q4_8,
+          fma(FLOAT_TYPE(by20.y), q4_9,
+          fma(FLOAT_TYPE(by216.x), q4_10,
+             FLOAT_TYPE(by216.y) * q4_11)));
         const FLOAT_TYPE sw =
-          fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by232.x), q4_12,
+          fma(FLOAT_TYPE(by232.y), q4_13,
+          fma(FLOAT_TYPE(by248.x), q4_14,
+             FLOAT_TYPE(by248.y) * q4_15)));
         const FLOAT_TYPE smin =
           fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
           fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,

From 5b3466bedfa84aa29c6871c7254467550186ecc6 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:30:27 -0600
Subject: [PATCH 20/43] vulkan: Handle GPUs with less shared memory (#10468)

There have been reports of failure to compile on systems with <= 32KB
of shared memory (e.g. #10037). This change makes the large tile size
fall back to a smaller size if necessary, and makes mul_mat_id fall
back to CPU if there's only 16KB of shared memory.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 158 +++++++++++++++++----------
 1 file changed, 103 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index da1cfd24e..a833007fb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1232,8 +1232,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     std::cerr << "ggml_vulkan: Compiling shaders";
 
     // mulmat
-    std::vector<uint32_t> l_warptile, m_warptile, s_warptile, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
-    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms;
+    std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
+                          l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
+    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms,
+                            l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms;
     uint32_t l_align, m_align, s_align;
 
     l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
@@ -1244,14 +1246,48 @@ static void ggml_vk_load_shaders(vk_device& device) {
     m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
     s_warptile_mmq = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
-    l_wg_denoms = {128, 128, 1 };
-    m_wg_denoms = { 64,  64, 1 };
-    s_wg_denoms = { 32,  32, 1 };
+    l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
+    m_mmq_wg_denoms = m_wg_denoms = { 64,  64, 1 };
+    s_mmq_wg_denoms = s_wg_denoms = { 32,  32, 1 };
 
     l_align = 128;
     m_align =  64;
     s_align =  32;
 
+    // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders
+    // and tile sizes, this should handle 16KB, 32KB, and 48KB+.
+    // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders.
+    // But the numbers happen to work out for 32KB shared memory size that when using the medium
+    // size there's enough room for everything, and we assert for this.
+    uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
+    if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
+        l_warptile = m_warptile;
+        l_wg_denoms = m_wg_denoms;
+        shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
+        GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+    if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+        // assert mul_mat_mat_id shaders will fit.
+        GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+
+    shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
+    if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
+        if (device->properties.limits.maxComputeSharedMemorySize == 32768) {
+            l_warptile_mmq = m_warptile_mmq;
+            l_mmq_wg_denoms = m_mmq_wg_denoms;
+        } else {
+            l_warptile_mmq = s_warptile_mmq;
+            l_mmq_wg_denoms = s_mmq_wg_denoms;
+        }
+        shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
+        GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+    if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+        // assert mul_mat_mat_id shaders will fit.
+        GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+
     device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
 
@@ -1299,35 +1335,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
         CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
+        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        }
 #undef CREATE_MM
     } else {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
@@ -1344,35 +1383,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
         CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
+        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        }
 #undef CREATE_MM
     }
 
@@ -6541,6 +6583,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
+                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+                if (op->op == GGML_OP_MUL_MAT_ID &&
+                    ggml_vk_get_device(ctx->device)->properties.limits.maxComputeSharedMemorySize < 32768) {
+                    // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                    return false;
+                }
                 switch (op->src[0]->type) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:

From c31ed2abfce05c38a2a5189586bfae45a139a547 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:32:54 -0600
Subject: [PATCH 21/43] vulkan: define all quant data structures in types.comp
 (#10440)

---
 .../src/ggml-vulkan/vulkan-shaders/types.comp | 147 +++++++++---------
 1 file changed, 76 insertions(+), 71 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
index bc28e0ab8..eecc47f3a 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -30,10 +30,8 @@
 #endif
 #endif
 
-#if defined(DATA_A_Q4_0)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q4_0 32
+#define QUANT_R_Q4_0 2
 
 struct block_q4_0
 {
@@ -46,14 +44,15 @@ struct block_q4_0_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q4_0)
+#define QUANT_K QUANT_K_Q4_0
+#define QUANT_R QUANT_R_Q4_0
 #define A_TYPE block_q4_0
 #define A_TYPE_PACKED16 block_q4_0_packed16
 #endif
 
-#if defined(DATA_A_Q4_1)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q4_1 32
+#define QUANT_R_Q4_1 2
 
 struct block_q4_1
 {
@@ -69,15 +68,15 @@ struct block_q4_1_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q4_1)
+#define QUANT_K QUANT_K_Q4_1
+#define QUANT_R QUANT_R_Q4_1
 #define A_TYPE block_q4_1
 #define A_TYPE_PACKED16 block_q4_1_packed16
 #endif
 
-#if defined(DATA_A_Q5_0)
-#extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q5_0 32
+#define QUANT_R_Q5_0 2
 
 struct block_q5_0
 {
@@ -93,15 +92,15 @@ struct block_q5_0_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q5_0)
+#define QUANT_K QUANT_K_Q5_0
+#define QUANT_R QUANT_R_Q5_0
 #define A_TYPE block_q5_0
 #define A_TYPE_PACKED16 block_q5_0_packed16
 #endif
 
-#if defined(DATA_A_Q5_1)
-#extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q5_1 32
+#define QUANT_R_Q5_1 2
 
 struct block_q5_1
 {
@@ -119,14 +118,15 @@ struct block_q5_1_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q5_1)
+#define QUANT_K QUANT_K_Q5_1
+#define QUANT_R QUANT_R_Q5_1
 #define A_TYPE block_q5_1
 #define A_TYPE_PACKED16 block_q5_1_packed16
 #endif
 
-#if defined(DATA_A_Q8_0)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 1
+#define QUANT_K_Q8_0 32
+#define QUANT_R_Q8_0 1
 
 struct block_q8_0
 {
@@ -139,164 +139,164 @@ struct block_q8_0_packed16
     uint16_t qs[32/2];
 };
 
+#if defined(DATA_A_Q8_0)
+#define QUANT_K QUANT_K_Q8_0
+#define QUANT_R QUANT_R_Q8_0
 #define A_TYPE block_q8_0
 #define A_TYPE_PACKED16 block_q8_0_packed16
 #endif
 
 // K-quants
-#if defined(DATA_A_Q2_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q2_K 256
 
 struct block_q2_K
 {
-    uint8_t scales[QUANT_K/16];
-    uint8_t qs[QUANT_K/4];
+    uint8_t scales[QUANT_K_Q2_K/16];
+    uint8_t qs[QUANT_K_Q2_K/4];
     f16vec2 d;
 };
 
 struct block_q2_K_packed16
 {
-    uint16_t scales[QUANT_K/16/2];
-    uint16_t qs[QUANT_K/4/2];
+    uint16_t scales[QUANT_K_Q2_K/16/2];
+    uint16_t qs[QUANT_K_Q2_K/4/2];
     f16vec2 d;
 };
 
 struct block_q2_K_packed32
 {
-    uint32_t scales[QUANT_K/16/4];
-    uint32_t qs[QUANT_K/4/4];
+    uint32_t scales[QUANT_K_Q2_K/16/4];
+    uint32_t qs[QUANT_K_Q2_K/4/4];
     f16vec2 d;
 };
 
+#if defined(DATA_A_Q2_K)
+#define QUANT_K QUANT_K_Q2_K
 #define A_TYPE block_q2_K
 #define A_TYPE_PACKED16 block_q2_K_packed16
 #define A_TYPE_PACKED32 block_q2_K_packed32
 #endif
 
-#if defined(DATA_A_Q3_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q3_K 256
 
 struct block_q3_K
 {
-    uint8_t hmask[QUANT_K/8];
-    uint8_t qs[QUANT_K/4];
+    uint8_t hmask[QUANT_K_Q3_K/8];
+    uint8_t qs[QUANT_K_Q3_K/4];
     uint8_t scales[12];
     float16_t d;
 };
 
 struct block_q3_K_packed16
 {
-    uint16_t hmask[QUANT_K/8/2];
-    uint16_t qs[QUANT_K/4/2];
+    uint16_t hmask[QUANT_K_Q3_K/8/2];
+    uint16_t qs[QUANT_K_Q3_K/4/2];
     uint16_t scales[12/2];
     float16_t d;
 };
 
+#if defined(DATA_A_Q3_K)
+#define QUANT_K QUANT_K_Q3_K
 #define A_TYPE block_q3_K
 #define A_TYPE_PACKED16 block_q3_K_packed16
 #endif
 
-#if defined(DATA_A_Q4_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q4_K 256
 
 struct block_q4_K
 {
     f16vec2 d;
-    uint8_t scales[3*QUANT_K/64];
-    uint8_t qs[QUANT_K/2];
+    uint8_t scales[3*QUANT_K_Q4_K/64];
+    uint8_t qs[QUANT_K_Q4_K/2];
 };
 
 struct block_q4_K_packed16
 {
     f16vec2 d;
-    uint16_t scales[3*QUANT_K/64/2];
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t scales[3*QUANT_K_Q4_K/64/2];
+    uint16_t qs[QUANT_K_Q4_K/2/2];
 };
 
 struct block_q4_K_packed32
 {
     f16vec2 d;
-    uint32_t scales[3*QUANT_K/64/4];
-    uint32_t qs[QUANT_K/2/4];
+    uint32_t scales[3*QUANT_K_Q4_K/64/4];
+    uint32_t qs[QUANT_K_Q4_K/2/4];
 };
 
+#if defined(DATA_A_Q4_K)
+#define QUANT_K QUANT_K_Q4_K
 #define A_TYPE block_q4_K
 #define A_TYPE_PACKED16 block_q4_K_packed16
 #define A_TYPE_PACKED32 block_q4_K_packed32
 #endif
 
-#if defined(DATA_A_Q5_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q5_K 256
 
 struct block_q5_K
 {
     f16vec2 d;
     uint8_t scales[12];
-    uint8_t qh[QUANT_K/8];
-    uint8_t qs[QUANT_K/2];
+    uint8_t qh[QUANT_K_Q5_K/8];
+    uint8_t qs[QUANT_K_Q5_K/2];
 };
 
 struct block_q5_K_packed16
 {
     f16vec2 d;
     uint16_t scales[12/2];
-    uint16_t qh[QUANT_K/8/2];
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t qh[QUANT_K_Q5_K/8/2];
+    uint16_t qs[QUANT_K_Q5_K/2/2];
 };
 
+#if defined(DATA_A_Q5_K)
+#define QUANT_K QUANT_K_Q5_K
 #define A_TYPE block_q5_K
 #define A_TYPE_PACKED16 block_q5_K_packed16
 #endif
 
-#if defined(DATA_A_Q6_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q6_K 256
 
 struct block_q6_K
 {
-    uint8_t ql[QUANT_K/2];
-    uint8_t qh[QUANT_K/4];
-    int8_t scales[QUANT_K/16];
+    uint8_t ql[QUANT_K_Q6_K/2];
+    uint8_t qh[QUANT_K_Q6_K/4];
+    int8_t scales[QUANT_K_Q6_K/16];
     float16_t d;
 };
 
 struct block_q6_K_packed16
 {
-    uint16_t ql[QUANT_K/2/2];
-    uint16_t qh[QUANT_K/4/2];
-    int8_t scales[QUANT_K/16];
+    uint16_t ql[QUANT_K_Q6_K/2/2];
+    uint16_t qh[QUANT_K_Q6_K/4/2];
+    int8_t scales[QUANT_K_Q6_K/16];
     float16_t d;
 };
 
+#if defined(DATA_A_Q6_K)
+#define QUANT_K QUANT_K_Q6_K
 #define A_TYPE block_q6_K
 #define A_TYPE_PACKED16 block_q6_K_packed16
 #endif
 
 // IQuants
 
-#if defined(DATA_A_IQ4_NL)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_IQ4_NL 32
+#define QUANT_R_IQ4_NL 2
 
 struct block_iq4_nl
 {
     float16_t d;
-    uint8_t qs[QUANT_K/2];
+    uint8_t qs[QUANT_K_IQ4_NL/2];
 };
 
 struct block_iq4_nl_packed16
 {
     float16_t d;
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t qs[QUANT_K_IQ4_NL/2/2];
 };
 
-#define A_TYPE block_iq4_nl
-#define A_TYPE_PACKED16 block_iq4_nl_packed16
+#if defined(DATA_A_IQ4_NL)
 
 const int8_t kvalues_iq4nl_const[16] = {
     int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
@@ -313,6 +313,11 @@ void init_iq4nl_shmem()
     }
     barrier();
 }
+
+#define QUANT_K QUANT_K_IQ4_NL
+#define QUANT_R QUANT_R_IQ4_NL
+#define A_TYPE block_iq4_nl
+#define A_TYPE_PACKED16 block_iq4_nl_packed16
 #endif
 
 #endif // !defined(GGML_TYPES_COMP)

From 9150f8fef95327474d39ccd6c6e30787e85f3529 Mon Sep 17 00:00:00 2001
From: Frankie Robertson <frankier@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:50:26 +0200
Subject: [PATCH 22/43] Do not include arm_neon.h when compiling CUDA code
 (ggml/1028)

---
 ggml/src/ggml-impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 3965be787..f39b7a88c 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -14,7 +14,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

From fee824a1a1e35b5c49d482f654613addade61764 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 27 Nov 2024 11:10:42 +0200
Subject: [PATCH 23/43] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index d101d2b57..fd71a0a46 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58
+c598cbe30621251e80acbcf3b601589a37c17f4d

From 9e2301f4a4ef1690bd99360c11de43fe830b1c8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 27 Nov 2024 11:22:14 +0200
Subject: [PATCH 24/43] metal : fix group_norm support condition (#0)

---
 ggml/src/ggml-metal/ggml-metal.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c47f07a9e..ae6b25ede 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -997,9 +997,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
             return has_simdgroup_reduction;
+        case GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -2672,7 +2673,6 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_GROUP_NORM:
             {
-                GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(ggml_is_contiguous(src0));
 
                 float eps;

From 46c69e0e752ff16206347bb12f96ed69f4a01abf Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 27 Nov 2024 11:03:25 +0100
Subject: [PATCH 25/43] ci : faster CUDA toolkit installation method and use
 ccache (#10537)

* ci : faster CUDA toolkit installation method and use ccache

* remove fetch-depth

* only pack CUDA runtime on master
---
 .github/workflows/build.yml | 117 +++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 399641c4a..e02b5c620 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -892,12 +892,12 @@ jobs:
             cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
             cmake --build build
 
-  windows-latest-cmake-cuda:
-    runs-on: windows-latest
+  windows-2019-cmake-cuda:
+    runs-on: windows-2019
 
     strategy:
       matrix:
-        cuda: ['12.6.2']
+        cuda: ['12.4', '11.7']
         build: ['cuda']
 
     steps:
@@ -905,13 +905,66 @@ jobs:
         id: checkout
         uses: actions/checkout@v4
 
-      - name: Install CUDA toolkit
-        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.19
+      - name: Install Cuda Toolkit 11.7
+        if: ${{ matrix.cuda == '11.7' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install Cuda Toolkit 12.4
+        if: ${{ matrix.cuda == '12.4' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2
         with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
 
       - name: Install Ninja
         id: install_ninja
@@ -922,44 +975,12 @@ jobs:
         id: cmake_build
         shell: cmd
         run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
-          cmake --build build --config Release -t ggml-cuda
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release
 
-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-    strategy:
-      matrix:
-        cuda: ['12.2.0', '11.7.1']
-        build: ['cuda']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install CUDA toolkit
-        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.15
-        with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
-          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
       - name: Determine tag name
         id: tag
         shell: bash
@@ -987,10 +1008,12 @@ jobs:
           name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
       - name: Copy and pack Cuda runtime
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
         run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
           $dst='.\build\bin\cudart\'
-          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
           7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
 
       - name: Upload Cuda runtime

From 3ad5451f3b75809e3033e4e577b9f60bcaf6676a Mon Sep 17 00:00:00 2001
From: uvos <devnull@uvos.xyz>
Date: Wed, 27 Nov 2024 17:10:08 +0100
Subject: [PATCH 26/43] Add some minimal optimizations for CDNA (#10498)

* Add some minimal optimizations for CDNA

* ggml_cuda: set launch bounds also for GCN as it helps there too
---
 ggml/src/ggml-cuda/common.cuh    | 17 ++++++++++++++---
 ggml/src/ggml-cuda/ggml-cuda.cu  | 11 ++++++++++-
 ggml/src/ggml-cuda/mmq.cu        |  2 +-
 ggml/src/ggml-cuda/mmq.cuh       |  4 ++--
 ggml/src/ggml-cuda/mmvq.cu       |  2 +-
 ggml/src/ggml-cuda/vendors/hip.h |  8 ++++++++
 6 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index b0dd16066..535118d87 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -47,9 +47,20 @@
 #define CC_TURING     750
 #define CC_AMPERE     800
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
+
+// GCN/CNDA, wave size is 64
+#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
+#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+
+// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+
 #define CC_QY1        210
 #define CC_QY2        220
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2a78a4393..d6e4bfdd0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
 
+        cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+            cu_compute_type = CUBLAS_COMPUTE_32F;
+        }
+
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
         CUBLAS_CHECK(
             cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
                     &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
                                 src1_ptr,       CUDA_R_16F, ne10,
                     &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
+                    cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t      cu_data_type    = CUDA_R_16F;
 
+    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+    }
+
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index ae5c68ab3..7f7c8c90b 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 425acb20d..8d8867121 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 735975c16..02d150983 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;
 
-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 1f3c70c2e..3205534d6 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -95,6 +95,14 @@
 
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
+#define CDNA
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3

From 9f912511bc9414fa7a3c521378b6388cd932b58d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 27 Nov 2024 22:30:52 +0100
Subject: [PATCH 27/43] common : fix duplicated file name with hf_repo and
 hf_file (#10550)

---
 common/arg.cpp                 |  6 +++-
 common/common.cpp              | 54 +++++++++++++++++-----------------
 common/common.h                | 13 ++++++--
 examples/server/tests/utils.py |  1 -
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 272492e50..a6b7a1394 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = params.hf_repo + "_" + params.hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            params.model = fs_get_cache_file(filename);
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
diff --git a/common/common.cpp b/common/common.cpp
index 09ec9f238..2b2f00098 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
 }
 
 struct llama_model * common_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!common_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 
@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
             /*.no_alloc = */ true,
             /*.ctx      = */ NULL,
         };
-        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
+            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
             return NULL;
         }
 
@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
         // Verify the first split file format
         // and extract split URL and PATH prefixes
         {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
                 return NULL;
             }
 
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
                 return NULL;
             }
         }
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(path_model, params);
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * repo,
-        const char * model,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url += model;
+    model_url += remote_path;
 
-    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
 struct llama_model * common_load_model_from_url(
-        const char * /*model_url*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * /*repo*/,
-        const char * /*model*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
diff --git a/common/common.h b/common/common.h
index 286642db2..9b1508a15 100644
--- a/common/common.h
+++ b/common/common.h
@@ -470,8 +470,17 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index bc590bcb3..e31743c50 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -319,7 +319,6 @@ class ServerPreset:
         server.model_hf_repo = "ggml-org/models"
         server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
         server.model_alias = "jina-reranker"
-        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
         server.n_ctx = 512
         server.n_batch = 512
         server.n_slots = 1

From b7420131bf8ab3e067bc660439ab1ab18be7edbd Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:24:46 +0800
Subject: [PATCH 28/43] CANN: ROPE operator optimization (#10540)

* [cann] ROPE operator optimization

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 308 +++++++++++++++++++++----------
 ggml/src/ggml-cann/ggml-cann.cpp |   9 -
 2 files changed, 211 insertions(+), 106 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index d7472ee3a..d707efc5d 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -21,22 +21,23 @@
  */
 
 #include "aclnn_ops.h"
-#include "ggml-impl.h"
 
+#include <aclnnop/aclnn_addcdiv.h>
 #include <aclnnop/aclnn_avgpool2d.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_cast.h>
 #include <aclnnop/aclnn_constant_pad_nd.h>
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_div.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
-#include <aclnnop/aclnn_mm.h>
-#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
+#include <aclnnop/aclnn_mm.h>
 #include <aclnnop/aclnn_permute.h>
 #include <aclnnop/aclnn_pow_tensor_tensor.h>
 #include <aclnnop/aclnn_reduce_sum.h>
@@ -56,6 +57,7 @@
 #include <exception>
 #include <vector>
 
+#include "ggml-impl.h"
 #include "kernels/ascendc_kernels.h"
 
 #define GGML_COMMON_DECL_C
@@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
 }
 
 /**
- * @brief Creates an ACL tensor initialized with ones using a provided buffer.
+ * @brief Creates an ACL tensor initialized with value using a provided buffer.
  *
- * This function initializes a tensor with ones using the specified buffer and
+ * This function initializes a tensor with value using the specified buffer and
  * tensor parameters.
  *
  * @param ctx The context for the CANN backend operations.
@@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
  * @param type_size The size of each element in the tensor data type.
  * @param value The value to be used for initializing the tensor (default
  * is 1.0).
- * @return An ACL tensor initialized with ones.
+ * @return An ACL tensor initialized with value.
  */
-static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
-                             size_t n_bytes, int64_t* ne, int64_t dims,
-                             aclDataType type, size_t type_size,
-                             float value = 1.0f) {
+static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
+                               size_t n_bytes, int64_t* ne, int64_t dims,
+                               aclDataType type, size_t type_size,
+                               float value = 1.0f) {
     aclTensor* acl_tensor =
         aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
     float alpha_host = 1.0f;
@@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
 
-    aclTensor* acl_gamma = aclnn_ones(
+    aclTensor* acl_gamma = aclnn_values(
         ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
         ggml_cann_type_mapping(src->type), ggml_element_size(src));
 
@@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
 
     aclTensor* mask_tensor =
-        aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
-                   GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                   ggml_element_size(src), value);
+        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
+                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
+                     ggml_element_size(src), value);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
+ result by the scalar value and adds it to self .
+ *
+ * Performs element-wise division of tensor1 by tensor2,
+ * multiplies the result by the scalar value and adds it to self .
+ * The operation is defined as:
+ * \f[
+ *     \text{out}_i = \text{selft}_i + \text{value} \times
+ \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The source tensor on which the addcdiv function will be
+ applied.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The value to be used for coefficient.
+ */
+static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
+                                  aclTensor* acl_self, aclTensor* tensor1,
+                                  aclTensor* tensor2, float value) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
+        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+}
+
+/**
+ * @brief Matrix division, optionally in-place.
+ *
+ * This function division each element of the source tensor `acl_src` by the
+ * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * If `inplace` is true, `acl_dst` will not be used and the operation is
+ * performed in-place on `acl_src`. The operation is defined as: \f[
+ *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor..
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                             aclTensor* acl_other, aclTensor* acl_dst,
+                             bool inplace) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (inplace) {
+        ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
+                                                  &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+    } else {
+        ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                           &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(
+            aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    }
+}
+
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
@@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                ctx.stream()));
 
     switch (src0->type) {
-        case GGML_TYPE_F32:
-        {
+        case GGML_TYPE_F32: {
 #ifdef ASCEND_310P
-             // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f32 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
+                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->nb);
             break;
         }
-        case GGML_TYPE_F16:
-        {
+        case GGML_TYPE_F16: {
 #ifdef ASCEND_310P
-             // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f16 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                size_t dst_len =
+                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
+                    ggml_type_size(
+                        GGML_TYPE_F32);  // out is also f32, even input is f16
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
         workspaceAddr = workspace_allocator.get();
     }
 
-    ACL_CHECK(
-        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
 /**
@@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
 
     aclTensor* acl_input_tensor =
         ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
-    int64_t transpose_ne[] = {
-        bcast_weight_ne[1], bcast_weight_ne[0],
-        bcast_weight_ne[2], bcast_weight_ne[3],
-        bcast_weight_ne[4], bcast_weight_ne[5]
-    };
-    size_t transpose_nb[] = {
-        bcast_weight_nb[1], bcast_weight_nb[0],
-        bcast_weight_nb[2], bcast_weight_nb[3],
-        bcast_weight_nb[4], bcast_weight_nb[5]
-    };
+    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
+                              bcast_weight_ne[2], bcast_weight_ne[3],
+                              bcast_weight_ne[4], bcast_weight_ne[5]};
+    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
+                             bcast_weight_nb[2], bcast_weight_nb[3],
+                             bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor =
         ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
 
     switch (n_dims) {
-    case 2:
-        aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
-    case 3:
-        aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
-    default:
-        aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
     }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
@@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * multiplication will be stored.
  */
 static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst,
-                                   const enum ggml_type type) {
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
@@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
     // scale stored at the end of weight. Also need transpose.
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
+                         scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size,  input_ne[0] * input_elem_size};
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
     size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
@@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+        input_buffer =
+            input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer,
-            ACL_FLOAT16,
-            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
+            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
+            input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
 
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t output_elem_size = sizeof(uint16_t);
     size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
     ggml_cann_pool_alloc output_allocator(ctx.pool());
-    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    void* output_buffer =
+        output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
     size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
@@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             // first split
             int64_t weight_ne_offset = 0;
-            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t weight_ne[2] = {
+                max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
+                src0->ne[0]};
             int64_t scale_ne_offset = 0;
             int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
             int64_t output_ne_offset = 0;
@@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type),
-                weight_elem_size, weight_ne, weight_nb, 2,
-                ACL_FORMAT_ND, weight_ne_offset);
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride,
-                ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2,
-                ACL_FORMAT_ND, scale_ne_offset);
+                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride,
-                ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2,
-                ACL_FORMAT_ND, output_ne_offset);
+                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-                nullptr, nullptr, nullptr, nullptr, QK8_0,
-                acl_output_tensor, &workspaceSize, &executor));
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
+                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+                &workspaceSize, &executor));
             if (workspaceAddr == nullptr) {
                 workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
@@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             // other splits
             for (int64_t split = 1; split < split_size; split++) {
-                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
-                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                weight_ne_offset +=
+                    weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
+                                   ? src0->ne[1] - (max_elem_size * split)
+                                   : max_elem_size;
                 scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
                 scale_ne[0] = weight_ne[0];
-                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne_offset +=
+                    output_elem_size * output_ne[0] * output_ne[1];
                 output_ne[0] = weight_ne[0];
 
                 acl_weight_tensor = ggml_cann_create_tensor(
                     (char*)src0->data + batch0 * weight_stride,
-                    ggml_cann_type_mapping(type),
-                    weight_elem_size, weight_ne, weight_nb, 2,
-                    ACL_FORMAT_ND, weight_ne_offset);
+                    ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                    weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
                 acl_scale_tensor = ggml_cann_create_tensor(
-                    scale_offset + batch0 * scale_stride,
-                    ACL_FLOAT16,
-                    scale_elem_size, scale_ne, scale_nb, 2,
-                    ACL_FORMAT_ND, scale_ne_offset);
+                    scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                    scale_ne_offset);
                 acl_output_tensor = ggml_cann_create_tensor(
-                    (char*)output_buffer + batch1 * output_stride,
-                    ACL_FLOAT16,
-                    output_elem_size, output_ne, output_nb, 2,
-                    ACL_FORMAT_ND, output_ne_offset);
+                    (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                    output_ne_offset);
 
                 ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                     acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
@@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-            output_buffer,
-            ACL_FLOAT16,
-            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+            output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
+            output_cast_nb, GGML_MAX_DIMS);
         aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
+                   ggml_cann_type_mapping(dst->type));
 
         ACL_CHECK(aclDestroyTensor(acl_output_tensor));
         ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
@@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              aclTensor* acl_cos_repeat_tensor,
                              aclTensor* acl_sin_repeat_tensor,
-                             float theta_scale, bool is_neox) {
+                             float theta_scale, float freq_scale,
+                             bool is_neox) {
     // int sin/cos cache, cache has different repeat method depond on
     // @param.is_neox
 
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
+    ggml_tensor* src2 = dst->src[2];  // freq_factors
 
     // arange, [0,1,...,ne0/2]
     int64_t arange_length = src0->ne[0] / 2;
@@ -2907,11 +3000,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
                                                arange_length * sizeof(float_t));
     void* theta_scale_buffer = theta_scale_allocator.get();
-    aclTensor* acl_theta_scale_tensor = aclnn_ones(
+    aclTensor* acl_theta_scale_tensor = aclnn_values(
         ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
         GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
     aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
 
+    // freq_scale
+    if (freq_scale != 1) {
+        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
+    }
+
+    // freq_factors
+    if (src2) {
+        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
+            src2->data, ggml_cann_type_mapping(src2->type),
+            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
+        aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
+                         nullptr, true);
+    }
+
     // position
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     int64_t position_length = src1->ne[0];
@@ -2940,6 +3047,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
               acl_theta_tensor);
 
+    // // power[] * position[] * freq_scale / freq_factors[]
+    // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
+    //                                            theta_length *
+    //                                            sizeof(float_t));
+    // aclTensor* acl_theat_final_tensor = aclnn_zero(
+    //     ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
+    //     theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
+    // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
+    //                       acl_freq_factors_tensor, freq_scale);
+
     // permute: [0,1,2,3]->[0,2,1,3]
     int64_t permute_ne[] = {arange_length, 1, position_length, 1};
     size_t permute_nb[GGML_MAX_DIMS];
@@ -3038,8 +3155,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
     // TODO: attn_factor != 1
     GGML_ASSERT(attn_factor == 1);
     // TODO: n_dims <= ne0
@@ -3047,8 +3162,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(n_dims % 2 == 0);
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
-    // TODO: freq_scale != 1
-    GGML_ASSERT(freq_scale == 1);
     // TODO: type == GGML_TYPE_F16
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
@@ -3081,7 +3194,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
-                     theta_scale, is_neox);
+                     theta_scale, freq_scale, is_neox);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -3096,7 +3209,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
     ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
-        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
         workspaceAddr = workspace_allocator.get();
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index c7a3419c7..bcb54e444 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
-            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
             float * attn_factor = (float*)((int32_t*)op->op_params + 8);
-            // TODO: with freq_factors
-            if (op->src[2] != NULL) {
-                return false;
-            }
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
@@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: freq_scale != 1
-            if (*freq_scale != 1) {
-                return false;
-            }
             // TODO: attn_factor != 1
             if (*attn_factor != 1) {
                 return false;

From 605fa66c509f9f117bd654cf0b9b3ea08bb86e80 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 28 Nov 2024 15:25:24 +0800
Subject: [PATCH 29/43] CANN: Fix SOC_TYPE compile bug (#10519)

* CANN: Fix the bug build fail on Ascend310P under two cases:
1) Manual specify SOC_TYPE
2) Under some unusual compile environment

* Update the cann backend News content: Support F16 and F32 data type model for Ascend 310P NPU.

* fix CANN  compile fail bug: the assert in ascend kernel function doesn't supportted on some CANN version
---
 docs/backend/CANN.md                            |  3 +++
 ggml/src/ggml-cann/CMakeLists.txt               |  7 ++++---
 ggml/src/ggml-cann/kernels/CMakeLists.txt       |  2 +-
 ggml/src/ggml-cann/kernels/dup.cpp              |  1 -
 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp     | 16 ++++++++++++----
 .../src/ggml-cann/kernels/quantize_f16_q8_0.cpp | 10 ++++++++++
 .../src/ggml-cann/kernels/quantize_f32_q8_0.cpp | 10 ++++++++++
 .../kernels/quantize_float_to_q4_0.cpp          | 17 +++++++++++++++++
 8 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 6bdd9d2da..496e05807 100644
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
 ## News
 
+- 2024.11
+  - Support F16 and F32 data type model for Ascend 310P NPU.
 - 2024.8
   - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
 - 2024.7
@@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |
+| Atlas 300I Duo                | Support |
 
 *Notes:*
 
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
index 901327185..05cf06bfa 100644
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
     detect_ascend_soc_type(SOC_VERSION)
     set(SOC_TYPE "${SOC_VERSION}")
     message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
-else()
-    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 endif()
 
-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
index 6a4e17cce..d687220c3 100644
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
 ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp
index 99f03e058..c7ba38d10 100644
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -20,7 +20,6 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
-        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
index 377211096..4fbe72208 100644
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -2,6 +2,15 @@
 
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support 4bit get row
+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support 4bit get row.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 
@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
-#ifdef ASCEND_310P
-        // TODO: 310P support quantification
-#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
+
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
 
@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
index 8423b3f02..504b43afa 100644
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->8bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32
@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
index b7c575093..05b0bc1df 100644
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->8bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32
@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
index 9c8c86b66..1188937b7 100644
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -1,6 +1,21 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support float->4bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->4bit quantization.\n");
+    }
+
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->4bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define Group_Size 32
@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P

From c6bc73951ed52466392b1abda98c28ecbe522c7f Mon Sep 17 00:00:00 2001
From: Ruixin Huang <18860020911@163.com>
Date: Thu, 28 Nov 2024 15:27:11 +0800
Subject: [PATCH 30/43] CANN: Update cann.md to display correctly in CLion
 (#10538)

---
 docs/backend/CANN.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 496e05807..23f10175a 100644
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -42,6 +42,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 ### Ascend NPU
 
 **Verified devices**
+
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |

From 2025fa67e94358deda4740a74fe9803916cb2f60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20L=C3=B3pez?= <slp@redhat.com>
Date: Thu, 28 Nov 2024 12:51:38 +0100
Subject: [PATCH 31/43] kompute : improve backend to pass test_backend_ops
 (#10542)

* kompute: op_unary: reject unsupported parameters

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: softmax: implement ALiBi support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: rope: implement neox and phi3 support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_q4_k permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_[q4_0|q4_1|q8_0] permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_f16 permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_q6_k permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

---------

Signed-off-by: Sergio Lopez <slp@redhat.com>
---
 ggml/src/ggml-kompute/CMakeLists.txt          |  12 +-
 ggml/src/ggml-kompute/ggml-kompute.cpp        | 176 ++++++++++++------
 .../ggml-kompute/kompute-shaders/common.comp  |   1 +
 .../kompute-shaders/op_mul_mat_f16.comp       |   6 +-
 .../kompute-shaders/op_mul_mat_q4_k.comp      |  19 +-
 .../kompute-shaders/op_mul_mat_q6_k.comp      |  24 ++-
 .../kompute-shaders/op_mul_mv_q_n.comp        |  14 +-
 .../kompute-shaders/op_mul_mv_q_n_pre.comp    |   8 +-
 .../kompute-shaders/op_rope_f16.comp          |  73 --------
 .../kompute-shaders/op_rope_f32.comp          |  73 --------
 .../kompute-shaders/op_rope_neox_f16.comp     |  52 ++++++
 .../kompute-shaders/op_rope_neox_f32.comp     |  52 ++++++
 .../kompute-shaders/op_rope_norm_f16.comp     |  52 ++++++
 .../kompute-shaders/op_rope_norm_f32.comp     |  52 ++++++
 .../kompute-shaders/op_softmax.comp           |  20 +-
 .../kompute-shaders/rope_common.comp          |   2 +
 16 files changed, 403 insertions(+), 233 deletions(-)
 delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
 delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp

diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt
index dc623926c..c9109d5e8 100644
--- a/ggml/src/ggml-kompute/CMakeLists.txt
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp
         kompute-shaders/op_getrows_q6_k.comp
-        kompute-shaders/op_rope_f16.comp
-        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_rope_norm_f16.comp
+        kompute-shaders/op_rope_norm_f32.comp
+        kompute-shaders/op_rope_neox_f16.comp
+        kompute-shaders/op_rope_neox_f32.comp
         kompute-shaders/op_cpy_f16_f16.comp
         kompute-shaders/op_cpy_f16_f32.comp
         kompute-shaders/op_cpy_f32_f16.comp
@@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h
         shaderop_getrows_q6_k.h
-        shaderop_rope_f16.h
-        shaderop_rope_f32.h
+        shaderop_rope_norm_f16.h
+        shaderop_rope_norm_f32.h
+        shaderop_rope_neox_f16.h
+        shaderop_rope_neox_f32.h
         shaderop_cpy_f16_f16.h
         shaderop_cpy_f16_f32.h
         shaderop_cpy_f32_f16.h
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 24566404d..28ceecfc4 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -28,8 +28,10 @@
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_f16.h"
-#include "shaderop_rope_f32.h"
+#include "shaderop_rope_norm_f16.h"
+#include "shaderop_rope_norm_f32.h"
+#include "shaderop_rope_neox_f16.h"
+#include "shaderop_rope_neox_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
           vk::DescriptorType::eStorageBuffer,
-          3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+          4 * size // Descriptor count is number of possible tensors to pass into an algorithm
           )
     };
 
@@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
-    float scale
+    float scale, float max_bias, float m0, float m1,
+    uint32_t n_head_log2
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
         kp::shader_data::op_softmax_comp_spv_len);
@@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        float scale;
+        float scale, max_bias, m0, m1;
+        uint32_t n_head_log2;
         int32_t mask;
     } pushConsts {
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        scale,
+        scale, max_bias, m0, m1,
+        n_head_log2,
         bool(inB)
     };
 
@@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02,
-    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
-    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
     int32_t ne0, int32_t ne1,
     uint32_t r2, uint32_t r3
 ) {
@@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        uint32_t nb00, nb01, nb02;
+        uint32_t nb00, nb01, nb02, nb03;
         int32_t ne10, ne11, ne12;
-        uint32_t nb10, nb11, nb12;
+        uint32_t nb10, nb11, nb12, nb13;
         int32_t ne0, ne1;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        nb00, nb01, nb02,
+        nb00, nb01, nb02, nb03,
         ne10, ne11, ne12,
-        nb10, nb11, nb12,
+        nb10, nb11, nb12, nb13,
         ne0, ne1,
         r2, r3
     };
@@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
     int32_t ne00, int32_t ne01, int32_t ne02,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
     int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
     uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
@@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
         int32_t ne00, ne01, ne02;
         int32_t ne10, ne12;
         int32_t ne0, ne1;
+        uint32_t nb01, nb02, nb03;
+        uint32_t nb11, nb12, nb13;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
         ne10, ne12,
         ne0, ne1,
+        nb01, nb02, nb03,
+        nb11, nb12, nb13,
         r2, r3
     };
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
@@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
-    int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
-    int32_t ne1, int32_t r2, int32_t r3
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
         kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
-        0, 0, 0,
-        ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
         kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
         inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        const uint32_t local_x = 2;
+        const uint32_t local_y = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& inC,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
-    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
+    float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
 ) {
     GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
 
-    static const auto spirv_f16 = getSpirvShader(
-        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
+    static const auto spirv_norm_f16 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
     );
-    static const auto spirv_f32 = getSpirvShader(
-        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
+    static const auto spirv_norm_f32 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
+    );
+    static const auto spirv_neox_f16 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
+    );
+    static const auto spirv_neox_f32 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
     );
 
     int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
     GGML_ASSERT(nb0  % type_size == 0);
 
     struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
+        uint32_t inAOff, inBOff, inCOff, outOff;
         int32_t n_dims, mode, n_ctx_orig;
-        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        float freq_base, freq_scale;
+        bool has_freq_factors;
+        float ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
         n_dims, mode, n_ctx_orig,
-        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+        freq_base, freq_scale,
+        has_freq_factors,
+        ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };
 
-    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
+    auto & inC_ = inC ? inC : inA;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_f16 = src0t == GGML_TYPE_F16;
+
+    auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
+        auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
         s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
-            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
-        s_algo->setTensors({inA, inB, out});
+        s_algo->setTensors({inA, inB, inC_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
 }
 
 static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    int64_t n = ggml_nelements(op);
     switch (op->op) {
         case GGML_OP_UNARY:
+            if (n % 4 != 0) return false;
             switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU:
+                    if (n % 8 != 0) return false;
+                    // fall through
+                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SILU:
                     return ggml_is_contiguous(op->src[0]);
                 default:
@@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
 
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q6_K:
                     return op->ne[3] == 1;
+                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
@@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
             uint32_t off_src0 = 0;
             uint32_t off_src1 = 0;
+            uint32_t off_src2 = 0;
             uint32_t off_dst  = 0;
             const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;
 
             switch (dst->op) {
@@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5021")
                         GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
 
-#pragma message("TODO: add ALiBi support")
-#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/7192")
-                        GGML_ASSERT(max_bias == 0.0f);
+                        const int64_t nrows_x = ggml_nrows(src0);
+                        const int64_t nrows_y = src0->ne[1];
 
-                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                        const uint32_t n_head      = nrows_x/nrows_y;
+                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
@@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                                    ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                    ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                                     ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q8_0:
                                 ggml_vk_mul_mat_q8_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_0:
                                 ggml_vk_mul_mat_q4_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_K:
                                 ggml_vk_mul_mat_q4_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q6_K:
                                 ggml_vk_mul_mat_q6_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             default: {
@@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     } break;
                 case GGML_OP_ROPE:
                     {
-#pragma message("TODO: implement phi3 frequency factors support")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
-                        GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
                         GGML_ASSERT(ne10 == ne02);
                         GGML_ASSERT(src0t == dstt);
                         // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
                         const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
+                        const bool has_freq_factors = dst->src[2] != nullptr;
+
                         float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                         memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
                         memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
@@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                         memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
                         ggml_vk_rope(
-                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
-                            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                            seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                            freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
                             ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                         );
                     } break;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp
index 2aaddf704..dbe4cf804 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/common.comp
@@ -3,6 +3,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
index 8f0a9031f..0ab1b2fc2 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
@@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
     uint nb00;
     uint nb01;
     uint nb02;
+    uint nb03;
     int ne10;
     int ne11;
     int ne12;
     uint nb10;
     uint nb11;
     uint nb12;
+    uint nb13;
     int ne0;
     int ne1;
     uint r2;
@@ -42,7 +44,7 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
+    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
 
     const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
 
@@ -52,7 +54,7 @@ void main() {
             break;
         }
 
-        const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
+        const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
         float sumf = 0;
         for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
index fc8e45aa9..a5752a3a0 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
@@ -24,8 +24,14 @@ layout (push_constant) uniform parameter {
     int ne01;
     int ne02;
     int ne12;
-    int r2;
-    int r3;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;
 
 void main() {
@@ -50,10 +56,11 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint offset1 =        r1*pcs.nb11 + (i12       )*pcs.nb12 + (i13       )*pcs.nb13;
 
-    const uint xblk = ib_row + offset0 + pcs.inAOff;
-    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
+    const uint xblk = offset0 + pcs.inAOff;
+    const uint y = (offset1 / 4) + pcs.inBOff;
 
     float yl[16];
     float yh[16];
@@ -74,7 +81,7 @@ void main() {
         }
 
         for (int row = 0; row < N_DST; row++) {
-            uint row_idx = row * nb;
+            uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
 
             uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
             uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
index c9baebdf4..d331d1a70 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
@@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
     int ne0;
     int ne1;
     int ne01;
-    int gqa;
+    int ne02;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;
 
 void main() {
@@ -34,12 +43,15 @@ void main() {
 
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
-    const uint r2 = gl_WorkGroupID.z;
+    const uint im = gl_WorkGroupID.z;
 
     const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
-    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
-    const uint x = row * nb + offset0; // Based from inA without base offset
-    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf = 0;
 
@@ -89,6 +101,6 @@ void main() {
 
     const float tot = subgroupAdd(sumf);
     if (subgroupElect()) {
-        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+        out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
     }
 }
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
index 440b5ab2c..a6517cc1f 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
@@ -14,10 +14,15 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    // pointers to src0 rows
+    uint ax[N_ROWS];
+    for (int row = 0; row < N_ROWS; ++row) {
+        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
 
-    const uint x = offset0; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+        ax[row] = offset0 + pcs.inAOff;
+    }
+
+    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
 
@@ -32,8 +37,7 @@ void main() {
 
     for (uint ib = ix; ib < nb; ib += 16) {
         for (int row = 0; row < N_ROWS; row++) {
-            const uint block_index = x + ib + row * nb;
-            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
         }
 
         yb += BLOCKS_IN_QUANT * 16;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
index 7912b09ac..a9a2f2218 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
@@ -1,5 +1,5 @@
 layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
+layout(local_size_y = 8) in;
 layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
     int  ne12;
     int  ne0;
     int  ne1;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
     uint r2;
     uint r3;
 } pcs;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
deleted file mode 100644
index 0ecfb2eab..000000000
--- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+1]);
-
-            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+pcs.n_dims/2]);
-
-            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
deleted file mode 100644
index cec0fd9a5..000000000
--- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+1];
-
-            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+pcs.n_dims/2];
-
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
new file mode 100644
index 000000000..63659cbfe
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+pcs.n_dims/2]);
+
+            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
new file mode 100644
index 000000000..4df56204d
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+pcs.n_dims/2];
+
+            out_[dst_data]              = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
new file mode 100644
index 000000000..a3c0eda8b
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+1]);
+
+            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
new file mode 100644
index 000000000..b7963ae72
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+1];
+
+            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
index 7bc9176ca..4165295bf 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
@@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants {
     int ne01;
     int ne02;
     float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
     int mask;
 } pcs;
 
@@ -34,17 +38,29 @@ void main() {
     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
+    float slope = 1.0f;
+
+    // ALiBi
+    if (pcs.max_bias > 0.0f) {
+        int64_t h = i02;
+
+        float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
+        int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
+
+        slope = pow(base, float(exp));
+    }
+
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
+        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
+        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }
diff --git a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
index df4702896..0fca640dc 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
@@ -8,12 +8,14 @@ layout(local_size_x = 1) in;
 layout (push_constant) uniform parameter {
     uint inAOff;
     uint inBOff;
+    uint inCOff;
     uint outOff;
     int n_dims;
     int mode;
     int n_ctx_orig;
     float freq_base;
     float freq_scale;
+    bool has_freq_factors;
     float ext_factor;
     float attn_factor;
     float beta_fast;

From c202cef1686182a78f8f4e253ab8d0c0ffe2fcc8 Mon Sep 17 00:00:00 2001
From: Shupei Fan <dymarkfan@outlook.com>
Date: Thu, 28 Nov 2024 20:52:03 +0800
Subject: [PATCH 32/43] ggml-cpu: support IQ4_NL_4_4 by runtime repack (#10541)

* ggml-cpu: support IQ4_NL_4_4 by runtime repack

* ggml-cpu: add __ARM_FEATURE_DOTPROD guard
---
 ggml/include/ggml-cpu.h              |   1 +
 ggml/include/ggml.h                  |   3 +
 ggml/src/ggml-common.h               |   6 +
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 321 +++++++++++++++++++++++++--
 ggml/src/ggml-cpu/ggml-cpu-aarch64.h |   2 +
 ggml/src/ggml-cpu/ggml-cpu.c         |  27 ++-
 ggml/src/ggml-cpu/ggml-cpu.cpp       |   2 +-
 ggml/src/ggml.c                      |   9 +
 8 files changed, 352 insertions(+), 19 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index a5358d047..e14ea9ea5 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -91,6 +91,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9843b09fb..65cb92c44 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -389,6 +389,9 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT,
     };
 
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 050161393..27253a6c2 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+typedef struct {
+    ggml_half d[4];        // deltas for 4 iq4_nl blocks
+    uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 96a16dfba..ced378879 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -187,6 +187,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 }
 #endif
 
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -528,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -996,6 +998,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1017,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -3386,6 +3484,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 // FIXME: this code is duplicated from ggml-aarch64.c
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
@@ -3518,6 +3727,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
     GGML_UNUSED(data_size);
 }
 
+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 // Prepare for optimized kernels if applicable
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
     if (cur->type == repack_type) {
@@ -3525,20 +3798,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
         return;
     }
 
-    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-
-    switch (repack_type) {
-        case GGML_TYPE_Q4_0_8_8:
-            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_8:
-            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_4:
-            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-            break;
-        default:
-            GGML_ABORT("Unsupported type");
+    if (cur->type == GGML_TYPE_Q4_0) {
+        switch (repack_type) {
+            case GGML_TYPE_Q4_0_8_8:
+                repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_8:
+                repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_4:
+                repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        switch (repack_type) {
+            case GGML_TYPE_IQ4_NL_4_4:
+                repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else {
+        GGML_ABORT("Unsupported type");
     }
 }
 
@@ -3551,9 +3834,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             return GGML_TYPE_Q4_0_4_8;
         }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             return GGML_TYPE_Q4_0_4_4;
         }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            return GGML_TYPE_IQ4_NL_4_4;
+        }
     }
 
     return cur->type;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index 53b30c1dd..3d9db6a19 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 void           ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
 enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index c6ede19d9..fea867440 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
+    int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif
 
 
@@ -446,6 +447,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .from_float               = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .gemv                     = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm                     = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2439,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
     ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
 
@@ -2453,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_neon = oldp;
 
+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
@@ -9133,6 +9149,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -13880,6 +13897,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index febed433a..44d99089a 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     const struct ggml_tensor * src1 = op->src[1];
 
     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a2318cb1..1a9a7efaf 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -831,6 +831,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .type_name                = "iq4_nl_4x4",
+        .blck_size                = QK4_NL,
+        .blck_size_interleave     = 4,
+        .type_size                = sizeof(block_iq4_nl),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float_ref           = NULL,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {

From eea986f215e1dc490654d012ccf2ab62fe8f606d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 14:56:23 +0200
Subject: [PATCH 33/43] cmake : fix ARM feature detection (#10543)

ggml-ci
---
 ggml/src/ggml-cpu/CMakeLists.txt | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index ddc05ecef..4dbc1f75b 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -82,17 +82,23 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             add_compile_definitions(__ARM_FEATURE_DOTPROD)
+
+            message(STATUS "ARM feature DOTPROD enabled")
         endif ()
 
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
 
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
             add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+
+            message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+            message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
@@ -113,17 +119,23 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                 if (GGML_COMPILER_SUPPORT_DOTPROD)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
                     add_compile_definitions(__ARM_FEATURE_DOTPROD)
+
+                    message(STATUS "ARM feature DOTPROD enabled")
                 endif ()
 
                 set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
 
                 set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+                set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+
                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
                 if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
                     add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+
+                    message(STATUS "ARM feature MATMUL_INT8 enabled")
                 endif ()
+
                 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
 
                 list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")

From 76b27d29c22af03172cf211a8a31025c7c828a57 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 14:56:37 +0200
Subject: [PATCH 34/43] ggml : fix row condition for i8mm kernels (#10561)

ggml-ci
---
 ggml/src/ggml-cpu/ggml-cpu-quants.c |  6 ++++--
 ggml/src/ggml-cpu/ggml-cpu.c        | 17 +++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index f0e276b69..11e8df253 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -1813,11 +1813,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                 l1, r1)), l2, r2)), l3, r3))), scale);
         }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
-        vst1_f32(s,      vget_low_f32(sumv2));
+        vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
+
         return;
     }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index fea867440..1c88e5d81 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7576,14 +7576,6 @@ UseGgmlGemm2:;
     // This is the size of the rest of the dimensions of the result
     const int64_t nr1 = ne1 * ne2 * ne3;
 
-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        num_rows_per_vec_dot = 1;
-    }
-
     // Now select a reasonable chunk size.
     int chunk_size = 16;
 
@@ -7646,6 +7638,15 @@ UseGgmlGemm2:;
         const int64_t ir1_start = dr1 * ith1;
         const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
 
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // TODO: currently the mmla kernels support only even numbered rows/cols.
+        // this check can be removed once they are extended to support odd numbered rows/cols too
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+
         ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
 
         if (nth >= nchunk0 * nchunk1) {

From e90688edd004fdb7063f463bd18408ba9ae008dd Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 28 Nov 2024 15:58:54 +0100
Subject: [PATCH 35/43] ci : fix tag name in cuda and hip releases (#10566)

---
 .github/workflows/build.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e02b5c620..48953dafa 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -904,6 +904,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
 
       - name: Install Cuda Toolkit 11.7
         if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
 
       - name: Install
         id: depends

From 7281cf13addfae9b64bb2be87e3b5b1914505d63 Mon Sep 17 00:00:00 2001
From: Random Fly <renfei8@live.cn>
Date: Thu, 28 Nov 2024 23:03:11 +0800
Subject: [PATCH 36/43] docs: fix outdated usage of llama-simple (#10565)

---
 docs/android.md           | 4 ++--
 examples/simple/README.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/android.md b/docs/android.md
index 320b62240..47530c6c1 100644
--- a/docs/android.md
+++ b/docs/android.md
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
 
 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```
 
-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
 
 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
 
diff --git a/examples/simple/README.md b/examples/simple/README.md
index 0ff342535..937008b24 100644
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...
 

From 890719311b6535e572f15965c6d7ec4ac2537f60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 28 Nov 2024 18:15:25 +0100
Subject: [PATCH 37/43] common: fix warning message when no GPU found (#10564)

---
 common/arg.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a6b7a1394..32d9a964c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1370,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2104,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));

From 6c595676899013102fdb0aa4b06a49954300c94a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 28 Nov 2024 19:17:49 +0100
Subject: [PATCH 38/43] server : (tests) don't use thread for capturing
 stdout/stderr, bump openai client library (#10568)

* server : (tests) don't use thread for capturing stdout/stderr

* test: bump openai to 1.55.2

* bump openai to 1.55.3
---
 examples/server/tests/requirements.txt |  2 +-
 examples/server/tests/utils.py         | 19 ++-----------------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 935a79114..074b9d47b 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index e31743c50..a831f113f 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -8,7 +8,6 @@ import os
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -161,26 +160,12 @@ class ServerProcess:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)
 
-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
 
         # wait for server to start

From 4c0a95b1074907ce7efe6f5bb6ae3351c01429ab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 20:45:07 +0200
Subject: [PATCH 39/43] llama : add missing model types

---
 src/llama.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e0..22b951ba2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2341,6 +2341,7 @@ enum e_model {
     MODEL_16B,
     MODEL_20B,
     MODEL_30B,
+    MODEL_32B,
     MODEL_34B,
     MODEL_35B,
     MODEL_40B,
@@ -5330,6 +5331,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16B:           return "16B";
         case MODEL_20B:           return "20B";
         case MODEL_30B:           return "30B";
+        case MODEL_32B:           return "32B";
         case MODEL_34B:           return "34B";
         case MODEL_35B:           return "35B";
         case MODEL_40B:           return "40B";
@@ -5690,7 +5692,10 @@ static void llm_load_hparams(
                     case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                     case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+                    case 48: model.type = e_model::MODEL_14B; break;
+                    case 64: model.type = e_model::MODEL_32B; break;
                     case 80: model.type = e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }

From dc22344088a7ee81a1e4f096459b03a72f24ccdc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 20:46:40 +0200
Subject: [PATCH 40/43] ggml : remove redundant copyright notice + update
 authors

---
 AUTHORS                              | 186 ++++++++++++++++++++++++++-
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c |   4 -
 2 files changed, 185 insertions(+), 5 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 1bd36158a..2eb60806a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,4 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Thu Nov 28 20:46:15 EET 2024
 # this file is auto-generated by scripts/gen-authors.sh
 
 0cc4m <picard12@live.de>
@@ -7,6 +7,7 @@
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
+65a <10104049+65a@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
@@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
+Alberto Cabrera Pérez <alberto.cabrera@intel.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
+Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
+Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
+Andreas (Andi) Kunar <andreask@msn.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
+Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
+Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
@@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bert Wagner <github@bertwagner.com>
 Bingan <70050083+binganao@users.noreply.github.com>
+Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
+Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
+Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
@@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
+CarryFun <76023481+CarryFun@users.noreply.github.com>
+Carsten Kragelund Jørgensen <carsten@kragelund.me>
+CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
+Charles Xu <63788048+chaxu01@users.noreply.github.com>
+Charles Xu <charles.xu@arm.com>
+Chen Xi <xi2.chen@intel.com>
+Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
+Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+Conrad Kramer <conrad@conradkramer.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
+Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
+Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
+Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
+Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
+Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Diego Devesa <slarengh@gmail.com>
+Diogo Teles Sant'Anna <diogoteles@google.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Dou Xinpeng <15529241576@163.com>
+Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
+Echo Nolan <echo@echonolan.net>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
@@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
+Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
+Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
@@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
+Faisal Zaghloul <faisal.zaghloul@gmail.com>
+Faisal Zaghloul <quic_fzaghlou@quicinc.com>
+Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
+FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Frankie Robertson <frankier@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
+Gabe Goodhart <ghart@us.ibm.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
@@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
+Huang Qi <huangqi3@xiaomi.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
+Huifeng Ou <79071290+ho2103@users.noreply.github.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
+Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
@@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
 Isaac McFadyen <isaac@imcf.me>
 IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan <nekotekina@gmail.com>
+Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
@@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
 Jason McCartney <jmac@theroot.org>
+Jason Stillerman <jason.t.stillerman@gmail.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
 Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
+Jeff Bolz <jbolz@nvidia.com>
+Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
@@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
+João Dinis Ferreira <hello@joaof.eu>
+Joe Eli McIlvain <joe.eli.mac@gmail.com>
+Joe Todd <joe.todd@codeplay.com>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
@@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junil Kim <logyourself@gmail.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
@@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
 Kasumi <90275229+kasumi-1@users.noreply.github.com>
 Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
+Keke Han <hankeke303@163.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
 Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
+Kevin Wang <kevmo314@gmail.com>
 Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
@@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
 Linwei Wang <wanix1988@gmail.com>
+Liu Jia <109258120+Septa2112@users.noreply.github.com>
+Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
+Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
+M-A <maruel@gmail.com>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Ma Mingfei <mingfei.ma@intel.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
+Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
 Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
+Mark Zhuang <zhuangqiubin@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
 Markus Tavenrath <mtavenrath@users.noreply.github.com>
 Martin Delille <martin@delille.org>
@@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs Henquet <mathijs.henquet@gmail.com>
 Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
+Matt Stephenson <mstephenson6@users.noreply.github.com>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matteo Mortari <matteo.mortari@gmail.com>
 Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
@@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
 Meng, Hengyu <hengyu.meng@intel.com>
+Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
@@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
+Michał Tuszyński <srgtuszy@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Minsoo Cheong <icycle0409@snu.ac.kr>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+MistApproach <98988043+MistApproach@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
+Molly Sophia <mollysophia379@gmail.com>
+MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
+Natsu <chino@hotococoa.moe>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
+Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicholai Tukanov <nicholaitukanov@gmail.com>
+Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
+NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+OSecret <135510162+OLSecret@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+PAB <pierreantoine.bannier@gmail.com>
+Pablo Duboue <pablo.duboue@gmail.com>
+Pascal Patry <ppatry@mtacitlabs.com>
 Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
+Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
@@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
 Phillip Kravtsov <phillip@kravtsov.net>
 Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
 Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
+Plamen Minev <pacominev@gmail.com>
+Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
 Qingyou Meng <meng.qingyou@gmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+R0CKSTAR <xiaodong.ye@mthreads.com>
+R0CKSTAR <yeahdongcn@gmail.com>
 RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
@@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
+Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
+Rich Dougherty <rich@rd.nz>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
 Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
 Roger Meier <r.meier@siemens.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
+Romain Biessy <romain.biessy@codeplay.com>
 Romain D <90720+Artefact2@users.noreply.github.com>
 Romain Neutron <romain@neutron.io>
 Roman Parykin <donderom@gmail.com>
 Ron Evans <ron@hybridgroup.com>
 Ron Jailall <rojailal@gmail.com>
+Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
+Ruchira Hasaranga <ruchira66@gmail.com>
+Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
+RunningLeon <maningsheng@sensetime.com>
+RunningLeon <mnsheng@yeah.net>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SRHMorris <69468379+SRHMorris@users.noreply.github.com>
+SXX <sxx1136965276@gmail.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
+Salvatore Mesoraca <s.mesoraca16@gmail.com>
 Sam Spilsbury <smspillaz@gmail.com>
 Sami Farin <3876865+Safari77@users.noreply.github.com>
 Samuel Maynard <samwmaynard@gmail.com>
@@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
 SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@redhat.com>
 Sergio López <slp@sinrega.org>
 Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shane A <shanea@allenai.org>
 Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shankar <gshankar.87@gmail.com>
+Shanshan Shen <467638484@qq.com>
 Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
 Shuichi Tsutsumi <shuichi0526@gmail.com>
+Shupei Fan <dymarkfan@outlook.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
 Sky Yan <skyan83@gmail.com>
 Slaren <2141330+slaren@users.noreply.github.com>
 Slava Primenko <primenko.s@gmail.com>
+Small Grass Forest <zixuanxcl@gmail.com>
 SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
 Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
@@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
 Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
+Steve Bonds <sbonds@gmail.com>
 Steve Grubb <ausearch.1@gmail.com>
 Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
+Sutou Kouhei <kou@cozmixng.org>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
@@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
 Thérence <13496987+Royalphax@users.noreply.github.com>
 Thibault Terrasson <thibault.terrasson@gmail.com>
 Thomas Klausner <wiz@gatalith.at>
+Thorsten Sommer <SommerEngineering@users.noreply.github.com>
 Tim Miller <drasticactions@users.noreply.github.com>
+Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
 Ting Lou <ting.lou@gmail.com>
@@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tony Wasserka <4840017+neobrain@users.noreply.github.com>
 Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
+Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
+Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
 Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Vali Malinoiu <0x4139@gmail.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
+Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
+Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
 Vladimir Malyutin <first-leon@yandex.ru>
 Vladimir Zorin <vladimir@deviant.guru>
+VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
@@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
 Xiao-Yong Jin <jinxiaoyong@gmail.com>
 XiaotaoChen <chenxiaotao1234@gmail.com>
 Xiaoyi Chen <cxychina@gmail.com>
+Xie Yanbo <xieyanbo@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 Xuan Son Nguyen <thichthat@gmail.com>
+Yaiko <elyaiko@hotmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
 Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
+Yoshi Suhara <y.suhara@gmail.com>
+Yoshi Suhara <ysuhara@nvidia.com>
+Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
 Yui <dev@sleepyyui.com>
+Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
 Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
 ZHAOKAI WANG <sanxianwei@163.com>
@@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
+Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
+Zhiyuan Li <lizhiyuan@uniartisan.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
 amd-lalithnc <lalithnc@amd.com>
+amritahs-ibm <amritahs@linux.vnet.ibm.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
@@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
 arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
+ardfork <134447697+ardfork@users.noreply.github.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
 automaticcat <daogiatuank54@gmail.com>
+awatuna <23447591+awatuna@users.noreply.github.com>
+b4b4o <zwbao@foxmail.com>
 bandoti <141645996+bandoti@users.noreply.github.com>
 beiller <beiller@gmail.com>
 bhubbb <79117352+bhubbb@users.noreply.github.com>
 bmwl <brian.marshall@tolko.com>
 bobqianic <129547291+bobqianic@users.noreply.github.com>
+brucepro <git@brucepro.net>
 bryanSwk <93190252+bryanSwk@users.noreply.github.com>
 bsilvereagle <bsilvereagle@users.noreply.github.com>
 bssrdf <merlintiger@hotmail.com>
@@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
+daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
+daminho <37615795+daminho@users.noreply.github.com>
 david raistrick <keen99@users.noreply.github.com>
 ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+devojony <61173062+devojony@users.noreply.github.com>
+ditsuke <ditsuke@protonmail.com>
 divinity76 <divinity76@gmail.com>
 dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
@@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
+fengerhu1 <2748250768@qq.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
 goerch <jhr.walter@t-online.de>
 grahameth <96447521+grahameth@users.noreply.github.com>
+gtygo <gtydoit@gmail.com>
 gwjr <502526+gwjr@users.noreply.github.com>
 h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
+haopeng <657407891@qq.com>
+hipudding <huafengchun@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
 hopkins385 <98618192+hopkins385@users.noreply.github.com>
@@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
 hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
+icppWorld <124377669+icppWorld@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
 intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
+jdomke <28772296+jdomke@users.noreply.github.com>
 jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
 joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
 kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
 kunnis <kunnis@users.noreply.github.com>
 kuronekosaiko <EvanChanJ@163.com>
+kustaaya <58045274+kustaaya@users.noreply.github.com>
 kuvaus <22169537+kuvaus@users.noreply.github.com>
 kwin1412 <42286931+kwin1412@users.noreply.github.com>
 l3utterfly <gc.pthzfoldr@gmail.com>
+laik <laik.lj@me.com>
 ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
+leo-pony <nengjunma@outlook.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
 liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
 loonerin <132926317+loonerin@users.noreply.github.com>
+ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
 luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
 maor-ps <154728172+maor-ps@users.noreply.github.com>
+matiaslin <45382001+matiaslin@users.noreply.github.com>
+matteo <matteogeniaccio@yahoo.it>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
 mj-shifu <77107165+mj-shifu@users.noreply.github.com>
 mmyjona <jonathan.gonse@gmail.com>
 momonga <115213907+mmnga@users.noreply.github.com>
+momonga <146910567+mmngays@users.noreply.github.com>
 moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
 mzcu <milos.cubrilo@gmail.com>
 nanahi <130121847+na-na-hi@users.noreply.github.com>
@@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
+pculliton <phillipculliton@gmail.com>
 pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
+piDack <104877312+piDack@users.noreply.github.com>
 pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
@@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
 sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
+serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
@@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
+standby24x7 <standby24x7@gmail.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
 strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
+tc-mb <157115220+tc-mb@users.noreply.github.com>
 texmex76 <40733439+texmex76@users.noreply.github.com>
 thement <40525767+thement@users.noreply.github.com>
+thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
 tjohnman <tjohnman@users.noreply.github.com>
+toyer <2042519524@qq.com>
 tslmy <tslmy@users.noreply.github.com>
 ubik2 <ubik2@users.noreply.github.com>
 uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
+uvos <devnull@uvos.xyz>
 valiray <133289098+valiray@users.noreply.github.com>
+vb <vaibhavs10@gmail.com>
 vik <vikhyatk@gmail.com>
 viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
+wangshuai09 <391746016@qq.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
 woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
 woodx <124784234+woodx9@users.noreply.github.com>
+wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
+xctan <axunlei@gmail.com>
 xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
+yuri@FreeBSD <yurivict@users.noreply.github.com>
 zakkor <edward.partenie@gmail.com>
 zhangkaihuo <zhangkaihuo@gmail.com>
+zhentaoyu <zhentao.yu@intel.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
 zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
 Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
+杨朱 · Kiki <baofa.fan@daocloud.io>
 源文雨 <41315874+fumiama@users.noreply.github.com>
+蕭澧邦 <45505768+shou692199@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index ced378879..69d3d327d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -1,7 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 

From 678d7994f4da0af3d29046be99950ac999ee9762 Mon Sep 17 00:00:00 2001
From: Ting Lou <louting@189.cn>
Date: Fri, 29 Nov 2024 08:09:46 +0800
Subject: [PATCH 41/43] llava: return false instead of exit (#10546)

---
 examples/llava/clip.cpp  | 15 +++++++++++----
 examples/llava/llava.cpp | 28 +++++++++++++++++++---------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index aae49c965..7ba4cea58 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -40,10 +40,17 @@
 #include <cinttypes>
 #include <limits>
 
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index be6988540..4ca53a0b8 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -11,13 +11,17 @@
 #include <limits>
 #include <vector>
 
-#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
 

From f095a649ec390e04dfab1b04e646ae8549dafaef Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Fri, 29 Nov 2024 00:18:02 -0600
Subject: [PATCH 42/43] vulkan: get the first command buffer submitted sooner
 (#10499)

This is an incremental improvement over #9118 to get work to the GPU a bit
sooner. The first part is to start with a smaller number of nodes before
the first submit, and ramp it up to the current 100 nodes/submit. The
second part is to reduce the dryrun overhead for all the nodes that just
need to request descriptor space.

With these changes I get around 1-2% speedup on RTX 4070 combined with my
old Haswell-era CPU.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 60 ++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a833007fb..849c11923 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         } else {
             compute_ctx = ctx->compute_ctx.lock();
         }
+    } else {
+        switch (node->op) {
+        case GGML_OP_REPEAT:
+        case GGML_OP_ACC:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+        case GGML_OP_PAD:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DUP:
+        case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_UNARY:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_LEAKY_RELU:
+            {
+                // These operations all go through ggml_vk_op_f32, so short-circuit and
+                // do the only thing needed for the dryrun.
+                vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
+                ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+                return false;
+            }
+        default:
+            break;
+        }
     }
 
     switch (node->op) {
@@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
-    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
-    constexpr int submit_count = 100;
+    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
+    // Start with a smaller count to get work submitted right away, and increase it after each submit.
+    int nodes_per_submit = 20;
     int submitted_nodes = 0;
+    int submit_count = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (first_node_in_batch) {
             submit_node_idx = i;
         }
 
-        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
-
+        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
 
         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
 
@@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         if (submit) {
             first_node_in_batch = true;
             submitted_nodes = 0;
+            switch (submit_count) {
+            case 0:
+                nodes_per_submit = 50;
+                break;
+            default:
+                nodes_per_submit = 100;
+                break;
+            }
+            submit_count++;
         }
     }
 

From 938f6087421889a3af7d0786c64406ced2be81b8 Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Fri, 29 Nov 2024 14:46:55 +0800
Subject: [PATCH 43/43] CANN: RoPE operator optimization (#10563)

* [cann] RoPE operator optimization

* [CANN]Code Formatting

---------

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 241 ++++++++++++++++++++++++++++---
 ggml/src/ggml-cann/ggml-cann.cpp |  13 +-
 2 files changed, 222 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index d707efc5d..b2d857e1e 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2965,7 +2965,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              aclTensor* acl_cos_repeat_tensor,
                              aclTensor* acl_sin_repeat_tensor,
                              float theta_scale, float freq_scale,
-                             bool is_neox) {
+                             float attn_factor, bool is_neox) {
     // int sin/cos cache, cache has different repeat method depond on
     // @param.is_neox
 
@@ -3017,6 +3017,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
             ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
         aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
                          nullptr, true);
+        ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
     }
 
     // position
@@ -3047,16 +3048,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
               acl_theta_tensor);
 
-    // // power[] * position[] * freq_scale / freq_factors[]
-    // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
-    //                                            theta_length *
-    //                                            sizeof(float_t));
-    // aclTensor* acl_theat_final_tensor = aclnn_zero(
-    //     ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
-    //     theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
-    // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
-    //                       acl_freq_factors_tensor, freq_scale);
-
     // permute: [0,1,2,3]->[0,2,1,3]
     int64_t permute_ne[] = {arange_length, 1, position_length, 1};
     size_t permute_nb[GGML_MAX_DIMS];
@@ -3092,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         GGML_MAX_DIMS, ACL_FORMAT_ND);
     aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
 
+    // attn_factor
+    if (attn_factor != 1) {
+        aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
+        aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
+    }
+
     // repeat
     if (is_neox) {
         int64_t repeatsArray[] = {1, 1, 1, 2};
@@ -3155,15 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    // TODO: attn_factor != 1
-    GGML_ASSERT(attn_factor == 1);
     // TODO: n_dims <= ne0
     GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
-    // TODO: type == GGML_TYPE_F16
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -3194,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
-                     theta_scale, freq_scale, is_neox);
+                     theta_scale, freq_scale, attn_factor, is_neox);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+#ifdef ASCEND_310P
+    // Special ROPE operation for 310P
+
+    // roll input
+    void* input_roll_buffer;
+    aclTensor* acl_minus_one_tensor;
+    void* minus_one_scale_buffer = nullptr;
+    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
+    ggml_cann_pool_alloc minus_one_scale_allocator(
+        ctx.pool(), sizeof(float_t) * src0->ne[0]);
+    if (!is_neox) {
+        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
+        input_roll_buffer = roll_allocator.get();
+        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
+                                    src0->ne[2], src0->ne[3]};
+        size_t input_roll_nb[GGML_MAX_DIMS];
+        input_roll_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
+        }
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+            src0->data, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+
+        int64_t shifts[] = {1};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+
+        // init [-1, 1, -1, 1, ...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        int64_t dim = 3;
+        int64_t* index = new int64_t[src0->ne[0]];
+        for (int i = 0; i < src0->ne[0]; i++) {
+            index[i] = i / 2 * 2;
+        }
+        int64_t index_num = src0->ne[0];
+        float value = -1;
+        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
+                                index_num, value);
+    } else {
+        // roll input: [q0,q1,q2,...] ->
+        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
+        input_roll_buffer = roll_allocator.get();
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
+
+        int64_t shifts[] = {src0->ne[0] / 2};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+        // init [-1, -1, -1, 1, 1，1，...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        // -1 * first half
+        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
+        size_t first_half_nb[GGML_MAX_DIMS];
+        first_half_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
+        }
+        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
+            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
+            first_half_nb, GGML_MAX_DIMS);
+        bool inplace = true;
+        float scale = -1;
+        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
+        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+    }
+
+    // TODO: n_dims < ne0
+    GGML_ASSERT(n_dims == src0->ne[0]);
+
+    // input * scale
+    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
+                                                  ggml_nbytes(src0));
+    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+    size_t input_nb[GGML_MAX_DIMS];
+    input_nb[0] = ggml_type_size(src0->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    }
+    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
+        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
+        input_roll_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+
+    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
+              acl_input_roll_mul_scale_tensor);
+
+    // output
+    void* output_fp32_buffer;
+    if (src0->type == GGML_TYPE_F32) {
+        aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
+        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
+                          acl_sin_reshape_tensor);
+        aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
+        // TODO: ne0 != n_dims in mode2
+    } else if (src0->type == GGML_TYPE_F16) {
+        size_t input_fp32_nb[GGML_MAX_DIMS];
+        input_fp32_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
+        }
+        ggml_cann_pool_alloc fp32_allocator1(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer1 = fp32_allocator1.get();
+        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
+            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        ggml_cann_pool_alloc fp32_allocator2(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer2 = fp32_allocator2.get();
+        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
+            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+
+        ggml_cann_pool_alloc fp32_allocator(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        output_fp32_buffer = fp32_allocator.get();
+        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
+            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
+        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
+                  input_fp32_tensor2);
+        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
+                  output_fp32_tensor);
+        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
+
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
+        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_src));
+    }
+    return;
+#endif
+
+    // src0 == GGML_TYPE_F16
+    // TODO: optimization this `if` code
+    if (src0->type == GGML_TYPE_F16) {
+        ggml_cann_pool_alloc sin_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        ggml_cann_pool_alloc cos_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        void* sin_final_buffer = sin_final_allocator.get();
+        void* cos_final_buffer = cos_final_allocator.get();
+
+        int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+        size_t sin_final_nb[GGML_MAX_DIMS];
+        sin_final_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
+        }
+        aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
+            sin_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
+            cos_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+
+        aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        acl_sin_reshape_tensor = acl_sin_final_tensor;
+        acl_cos_reshape_tensor = acl_cos_final_tensor;
+    }
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -3206,10 +3409,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         acl_mode = 1;
     }
 
-    aclTensor* acl_x = ggml_cann_create_tensor(src0);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
     ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
-        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
         acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
@@ -3219,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                            executor, ctx.stream()));
 
-    ACL_CHECK(aclDestroyTensor(acl_x));
+    ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index bcb54e444..04e25b8ab 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1739,7 +1739,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
-            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
@@ -1748,17 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: attn_factor != 1
-            if (*attn_factor != 1) {
-                return false;
-            }
-            //TODO: type == GGML_TYPE_F16
-            switch (op->src[0]->type) {
-                case GGML_TYPE_F32:
-                    return true;
-                default:
-                    return false;
-            }
+            return true;
         }
         case GGML_OP_UPSCALE: {
             // aclnnUpsampleNearest2dGetWorkspaceSize not support