From 0d2f22e45c3c3b6f8222acb6284d0c8c93443ba1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 18 Sep 2024 18:34:32 +0300
Subject: [PATCH 01/78] scripts : verify py deps at the start of compare
 (#9520)

---
 scripts/compare-commits.sh     | 3 +++
 scripts/compare-llama-bench.py | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 70679f4e5..8b9b1ad39 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -8,6 +8,9 @@ fi
 set -e
 set -x
 
+# verify at the start that the compare script has all the necessary dependencies installed
+./scripts/compare-llama-bench.py --check
+
 bench_args="${@:3}"
 
 rm -f llama-bench.sqlite > /dev/null
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 92b9e682a..e45e83ce8 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -92,6 +92,7 @@ help_s = (
     "If the columns are manually specified, then the results for each unique combination of the "
     "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
 )
+parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
 parser.add_argument("-s", "--show", help=help_s)
 parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
 
@@ -99,6 +100,10 @@ known_args, unknown_args = parser.parse_known_args()
 
 logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
 
+if known_args.check:
+    # Check if all required Python libraries are installed. Would have failed earlier if not.
+    sys.exit(0)
+
 if unknown_args:
     logger.error(f"Received unknown args: {unknown_args}.\n")
     parser.print_help()

From 64c6af3195c3cd4aa3328a1282d29cd2635c34c9 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 18 Sep 2024 19:13:08 +0200
Subject: [PATCH 02/78] ggml : fix n_threads_cur initialization with one thread
 (#9538)

* ggml : fix n_threads_cur initialization with one thread

* Update ggml/src/ggml.c

---------

Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com>
---
 ggml/src/ggml.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bccb62377..c5c98dbe4 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -20239,6 +20239,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
         }
     } else {
+        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else

From eca0fab44eb449ed34e17a9b651ae7398b494117 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Thu, 19 Sep 2024 09:58:14 +0200
Subject: [PATCH 03/78] imatrix : disable prompt escape by default (#9543)

---
 examples/imatrix/imatrix.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 265281699..c8e273529 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -572,6 +572,7 @@ int main(int argc, char ** argv) {
 
     params.n_ctx = 512;
     params.logits_all = true;
+    params.escape = false;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;

From 6026da52d6942b253df835070619775d849d0258 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 19 Sep 2024 12:44:53 +0300
Subject: [PATCH 04/78] server : clean-up completed tasks from waiting list
 (#9531)

ggml-ci
---
 examples/server/server.cpp | 40 ++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index dce69f832..0ca999994 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -531,26 +531,38 @@ struct server_response {
 
     // add the id_task to the list of tasks waiting for response
     void add_waiting_task_id(int id_task) {
-        SRV_DBG("waiting for task id = %d\n", id_task);
+        SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
 
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.insert(id_task);
     }
 
     void add_waiting_tasks(const std::vector<server_task> & tasks) {
-        for (const auto & t : tasks) {
-            add_waiting_task_id(t.id);
+        std::unique_lock<std::mutex> lock(mutex_results);
+
+        for (const auto & task : tasks) {
+            SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
+            waiting_task_ids.insert(task.id);
         }
     }
 
     // when the request is finished, we can remove task associated with it
     void remove_waiting_task_id(int id_task) {
-        SRV_DBG("task id = %d is done\n", id_task);
+        SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
 
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.erase(id_task);
     }
 
+    void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+
+        for (const auto & id_task : id_tasks) {
+            SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+            waiting_task_ids.erase(id_task);
+        }
+    }
+
     // This function blocks the thread until there is a response for one of the id_tasks
     server_task_result recv(const std::unordered_set<int> & id_tasks) {
         while (true) {
@@ -2774,6 +2786,8 @@ int main(int argc, char ** argv) {
             }, [&](const json & error_data) {
                 res_error(res, error_data);
             });
+
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
         } else {
             const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) {
                 ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
@@ -2784,7 +2798,12 @@ int main(int argc, char ** argv) {
                 sink.done();
                 return false;
             };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+
+            auto on_complete = [task_ids, &ctx_server] (bool) {
+                ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+            };
+
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     };
 
@@ -2823,6 +2842,8 @@ int main(int argc, char ** argv) {
             }, [&](const json & error_data) {
                 res_error(res, error_data);
             });
+
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
         } else {
             const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
                 ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
@@ -2844,7 +2865,12 @@ int main(int argc, char ** argv) {
                 sink.done();
                 return true;
             };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+
+            auto on_complete = [task_ids, &ctx_server] (bool) {
+                ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+            };
+
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     };
 
@@ -2953,6 +2979,8 @@ int main(int argc, char ** argv) {
                 res_error(res, error_data);
                 error = true;
             });
+
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
         }
 
         if (error) {

From 722ec1eb51ed53be2ab1ef31c6a1da8261803c71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Fri, 20 Sep 2024 08:38:10 +0200
Subject: [PATCH 05/78] perplexity : do not escape input data by default
 (#9548)

---
 examples/perplexity/perplexity.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 18e75a7a2..cbd466656 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1961,6 +1961,7 @@ int main(int argc, char ** argv) {
 
     params.n_ctx = 512;
     params.logits_all = true;
+    params.escape = false;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;

From d39e26741f9f02340651dbc640c9776e1a1128ef Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Sep 2024 11:46:56 +0300
Subject: [PATCH 06/78] examples : flush log upon ctrl+c (#9559)

---
 examples/infill/infill.cpp | 5 +++++
 examples/main/main.cpp     | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index b77b876cc..35607276a 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -97,6 +97,11 @@ static void sigint_handler(int signo) {
             LOG("\n");
             gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            gpt_log_pause(gpt_log_main());
+
             _exit(130);
         }
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 91fea9326..c3041f1fb 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -116,6 +116,11 @@ static void sigint_handler(int signo) {
             LOG("\n");
             gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            gpt_log_pause(gpt_log_main());
+
             _exit(130);
         }
     }

From 5cb12f68395a5ec00b357e408883ce124a11dcdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 20 Sep 2024 18:35:35 +0200
Subject: [PATCH 07/78] CUDA: fix sum.cu compilation for CUDA < 11.7 (#9562)

---
 ggml/src/ggml-cuda/sum.cu | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu
index 21da63509..0583e4fe0 100644
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -1,9 +1,13 @@
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+#define USE_CUB
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+
+#ifdef USE_CUB
 // On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
 // For this reason CUB must be included BEFORE anything else.
 #include <cub/cub.cuh>
 using namespace cub;
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // USE_CUB
 
 #include "sumrows.cuh"
 #include "sum.cuh"
@@ -11,7 +15,7 @@ using namespace cub;
 #include <cstdint>
 
 void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#ifdef USE_CUB
     size_t tmp_size = 0;
     DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
     ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
@@ -21,7 +25,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
     // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
     sum_rows_f32_cuda(x, dst, ne, 1, stream);
     GGML_UNUSED(pool);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // USE_CUB
 }
 
 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

From a6809c6a2e8163cba296d4cc0c5cd4bbc8073638 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 8 Sep 2024 11:10:43 +0300
Subject: [PATCH 08/78] examples : add null threadpool args where needed
 (ggml/0)

ggml-ci
---
 ggml/src/ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c5c98dbe4..51532c5ea 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -19753,8 +19753,8 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
 
 struct ggml_cplan ggml_graph_plan(
           const struct ggml_cgraph * cgraph,
-                           int       n_threads,
-    struct ggml_threadpool * threadpool) {
+                               int   n_threads,
+            struct ggml_threadpool * threadpool) {
 
     if (threadpool == NULL) {
         GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);

From 424c5d00a9b97dd5559635872db9b57f87c23b02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 20 Sep 2024 19:04:44 +0300
Subject: [PATCH 09/78] ggml/examples: add backend support for numerical
 optimization (ggml/949)

* CUDA eval works

* stochastic gradient descent op

* Adam except decay

* CUDA CROSS_ENTROPY_LOSS_BACK

* CUDA mnist-fc training works

* backend CLI arg

* refactor gguf load

* remove sched from opt_step_adam

* implement l1 regularization (weight decay)

* extra call to add optimizer

* initialize gradients with ggml_graph_reset

* gradient accumulation

* increment iter per eval instead of epoch

* adjust backend interfaces

* fix ggml_graph_reset without backend

* fix ggml graph export/import

* fixup

* rename

* revert ggml_opt changes

* more general CUDA repeat_back

* update documentation, fix CNN

* validation split

* add clarifying comment

* optimize PyTorch training

* adjust buffer size, thread count

* fix 0.0f validation split

* Update examples/mnist/mnist-common.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix gradient accumulation

* tensor flag for accumulators -> tensor hash set

* Update include/ggml.h

Co-authored-by: slaren <slarengh@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* fix test prints

* Update src/ggml-backend.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* better CUDA support for noncontiguous out_prod

* add comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
---
 ggml/include/ggml-backend.h               |   3 +-
 ggml/include/ggml.h                       |  40 +-
 ggml/src/ggml-backend-impl.h              |  19 +-
 ggml/src/ggml-backend.c                   |  25 ++
 ggml/src/ggml-cann.cpp                    |   1 +
 ggml/src/ggml-cuda.cu                     |  42 ++-
 ggml/src/ggml-cuda/binbcast.cu            |  67 ++++
 ggml/src/ggml-cuda/binbcast.cuh           |   2 +
 ggml/src/ggml-cuda/cross-entropy-loss.cu  |  60 +++
 ggml/src/ggml-cuda/cross-entropy-loss.cuh |   2 +
 ggml/src/ggml-cuda/opt-step-adamw.cu      |  80 ++++
 ggml/src/ggml-cuda/opt-step-adamw.cuh     |   5 +
 ggml/src/ggml-cuda/out-prod.cu            |  52 +++
 ggml/src/ggml-cuda/out-prod.cuh           |   3 +
 ggml/src/ggml-cuda/unary.cu               |  29 ++
 ggml/src/ggml-cuda/unary.cuh              |   3 +
 ggml/src/ggml-kompute.cpp                 |   1 +
 ggml/src/ggml-metal.m                     |   1 +
 ggml/src/ggml-rpc.cpp                     |   1 +
 ggml/src/ggml-sycl.cpp                    |   1 +
 ggml/src/ggml-vulkan.cpp                  |   1 +
 ggml/src/ggml.c                           | 424 +++++++++++++++++-----
 tests/test-backend-ops.cpp                | 148 ++++++--
 tests/test-grad0.cpp                      |   2 +-
 24 files changed, 883 insertions(+), 129 deletions(-)
 create mode 100644 ggml/src/ggml-cuda/opt-step-adamw.cu
 create mode 100644 ggml/src/ggml-cuda/opt-step-adamw.cuh
 create mode 100644 ggml/src/ggml-cuda/out-prod.cu
 create mode 100644 ggml/src/ggml-cuda/out-prod.cuh

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index e497b6d02..71c0bef8e 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -66,6 +66,7 @@ extern "C" {
     // "offset" refers to the offset of the tensor data for setting/getting data
     GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
@@ -122,7 +123,7 @@ extern "C" {
     // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
 
     GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
     GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char *               ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a413df357..2035001e9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -534,6 +534,7 @@ extern "C" {
 
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+        GGML_OP_OPT_STEP_ADAMW,
 
         GGML_OP_COUNT,
     };
@@ -571,10 +572,12 @@ extern "C" {
         GGML_LOG_LEVEL_DEBUG = 4,
     };
 
+    // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  = 4,
+        GGML_TENSOR_FLAG_INPUT    = 1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT   = 2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM    = 4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_LOSS     = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
     // n-dimensional tensor
@@ -2037,23 +2040,44 @@ extern "C" {
             struct ggml_tensor          * b,
             struct ggml_tensor          * c);
 
+    // AdamW optimizer step
+    // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
+    // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+    GGML_API struct ggml_tensor * ggml_opt_step_adamw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 alpha,
+            float                 beta1,
+            float                 beta2,
+            float                 eps,
+            float                 wd); // weight decay
+
     //
     // automatic differentiation
     //
 
-    GGML_API void ggml_set_param(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * tensor);
+    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
 
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
+    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
+
+    GGML_API void ggml_build_opt_adamw(
+            struct ggml_context * ctx,
+            struct ggml_cgraph  * gf,
+            struct ggml_cgraph  * gb,
+            float                 alpha,
+            float                 beta1,
+            float                 beta2,
+            float                 eps,
+            float                 wd); // weight decay
 
     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph);  // zero grads
+    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
     GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
 
     GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 36ca37086..b0d4141cc 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -38,15 +38,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*GGML_CALL get_name)      (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)   (ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)      (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+        void         (*GGML_CALL set_tensor)    (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor)    (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor)    (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)         (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)         (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };
 
     struct ggml_backend_buffer {
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index b5d9301a7..97ca5a1f3 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -246,6 +246,22 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
+GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    if (!size) {
+        return;
+    }
+    
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+
+    buf->iface.memset_tensor(buf, tensor, value, offset, size);
+}
+
 void ggml_backend_synchronize(ggml_backend_t backend) {
     if (backend->iface.synchronize == NULL) {
         return;
@@ -569,6 +585,12 @@ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t
     free(buffer->context);
 }
 
+GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
 GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
@@ -600,6 +622,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
@@ -613,6 +636,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
@@ -980,6 +1004,7 @@ static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(
         /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
         /* .get_base        = */ NULL,
         /* .init_tensor     = */ NULL,
+        /* .memset_tensor   = */ NULL,
         /* .set_tensor      = */ NULL,
         /* .get_tensor      = */ NULL,
         /* .cpy_tensor      = */ NULL,
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index aa315b83f..d3ab78006 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1037,6 +1037,7 @@ static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cann_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cann_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_cann_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cann_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_cann_buffer_cpy_tensor,
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 54f1a7c2d..b0843dc62 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -21,6 +21,8 @@
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
+#include "ggml-cuda/opt-step-adamw.cuh"
+#include "ggml-cuda/out-prod.cuh"
 #include "ggml-cuda/pad.cuh"
 #include "ggml-cuda/pool2d.cuh"
 #include "ggml-cuda/quantize.cuh"
@@ -493,6 +495,14 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
     }
 }
 
+GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+}
+
 GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
@@ -544,6 +554,7 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .memset_tensor   = */ ggml_backend_cuda_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
@@ -860,6 +871,7 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
     /* .cpy_tensor      = */ NULL,
@@ -2168,6 +2180,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_REPEAT:
             ggml_cuda_op_repeat(ctx, dst);
             break;
+        case GGML_OP_REPEAT_BACK:
+            ggml_cuda_op_repeat_back(ctx, dst);
+            break;
         case GGML_OP_GET_ROWS:
             ggml_cuda_op_get_rows(ctx, dst);
             break;
@@ -2201,6 +2216,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_NEG:
                     ggml_cuda_op_neg(ctx, dst);
                     break;
+                case GGML_UNARY_OP_STEP:
+                    ggml_cuda_op_step(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_GELU:
                     ggml_cuda_op_gelu(ctx, dst);
                     break;
@@ -2267,6 +2285,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_MUL_MAT_ID:
             ggml_cuda_mul_mat_id(ctx, dst);
             break;
+        case GGML_OP_OUT_PROD:
+            ggml_cuda_out_prod(ctx, dst);
+            break;
         case GGML_OP_SCALE:
             ggml_cuda_op_scale(ctx, dst);
             break;
@@ -2324,6 +2345,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+            ggml_cuda_cross_entropy_loss_back(ctx, dst);
+            break;
+        case GGML_OP_OPT_STEP_ADAMW:
+            ggml_cuda_opt_step_adamw(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2761,6 +2788,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
@@ -2813,6 +2841,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                         return false;
                 }
             } break;
+        case GGML_OP_OUT_PROD:
+            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             {
                 switch (op->src[0]->type) {
@@ -2869,6 +2899,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             } break;
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_REPEAT_BACK:
+                return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1;
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
@@ -2935,9 +2971,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             }
             return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
                 op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            return true;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+        case GGML_OP_OPT_STEP_ADAMW:
+            return true;
         default:
             return false;
     }
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index e1390a041..c7b6be4e2 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -1,4 +1,5 @@
 #include "binbcast.cuh"
+#include <cstdint>
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
@@ -90,6 +91,30 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
     dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 
+template <typename T>
+static __global__ void k_repeat_back(
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2) {
+
+    const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+    const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
+    const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
+
+    if (tid0 >= ne0) {
+        return;
+    }
+
+    T sum = 0;
+    for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
+        for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
+            for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
+                sum += src[i2*ne01*ne00 + i1*ne00 + i0];
+            }
+        }
+    }
+    dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
+}
+
 template<float (*bin_op)(const float, const float)>
 struct bin_bcast_cuda {
     template<typename src0_t, typename src1_t, typename dst_t>
@@ -247,6 +272,16 @@ struct bin_bcast_cuda {
     }
 };
 
+template <typename T>
+static void repeat_back_cuda(
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
+
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
+    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+}
+
 template<class op>
 static void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
@@ -286,3 +321,35 @@ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
+
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_can_repeat(dst, src0));
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    GGML_ASSERT(dst->ne[3] == 1);
+
+    switch (dst->type) {
+        case GGML_TYPE_F32: {
+            const float * src0_d = (const float *) src0->data;
+            float       * dst_d  = (float       *) dst->data;
+            repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
+        } break;
+        default: {
+            GGML_ASSERT(false);
+        } break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh
index 198c9ef6f..3ac1c9b03 100644
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -5,3 +5,5 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu
index 5575a90f6..ed09406a8 100644
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cu
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -71,6 +71,32 @@ static __global__ void cross_entropy_loss_f32(const float * logits, const float
     dst[blockIdx.x] = loss;
 }
 
+static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) {
+    extern __shared__ float tmp[];
+
+    float maxval = -INFINITY;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = logits[blockIdx.x*nclasses + i];
+        maxval = fmaxf(maxval, val);
+        tmp[i] = val;
+    }
+    maxval = warp_reduce_max(maxval);
+
+    float sum = 0.0f;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = expf(tmp[i] - maxval);
+        sum += val;
+        tmp[i] = val;
+    }
+    sum = warp_reduce_sum(sum);
+    const float sm_scale = 1.0f/sum;
+
+    const float d_by_nrows = *loss/gridDim.x;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows;
+    }
+}
+
 void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
@@ -104,3 +130,37 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
     // Combine results from individual blocks:
     sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
 }
+
+void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * opt0 = dst->src[2];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(opt0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(opt0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    const float * opt0_d = (const float *) opt0->data;
+    float       * dst_d  = (float       *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(nrows, 1, 1);
+    const int shmem = ne00*sizeof(float);
+
+    cross_entropy_loss_back_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, opt0_d, dst_d, ne00);
+}
diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cuh b/ggml/src/ggml-cuda/cross-entropy-loss.cuh
index 9d7b8b0f0..9ec7152ff 100644
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cuh
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cuh
@@ -3,3 +3,5 @@
 #define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
 
 void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu
new file mode 100644
index 000000000..d6f13a9c6
--- /dev/null
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cu
@@ -0,0 +1,80 @@
+#include "opt-step-adamw.cuh"
+
+#include <cstdint>
+
+static __global__ void opt_step_adamw_f32(
+    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
+    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
+    const float beta1h, const float beta2h) {
+
+    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const float gi = g[i];
+    const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
+    const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
+
+    g_m[i] = gmi;
+    g_v[i] = gvi;
+
+    const float mh =       gmi*beta1h;
+    const float vh = sqrtf(gvi*beta2h) + eps;
+
+    x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
+}
+
+static void opt_step_adamw_f32_cuda(
+    float * x, const float * g, float * g_m, float * g_v, const int64_t k,
+    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
+    const float beta1h, const float beta2h, cudaStream_t stream) {
+
+    const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
+    const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
+    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
+}
+
+void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0        = dst->src[0];
+    const ggml_tensor * src0_grad   = dst->src[1];
+    const ggml_tensor * src0_grad_m = dst->src[2];
+    const ggml_tensor * src0_grad_v = dst->src[3];
+
+    GGML_ASSERT(src0->type        == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad->type   == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+
+    float       * src0_d        = (float       *) src0->data;
+    const float * src0_grad_d   = (const float *) src0_grad->data;
+    float       * src0_grad_m_d = (float       *) src0_grad_m->data;
+    float       * src0_grad_v_d = (float       *) src0_grad_v->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t ne = ggml_nelements(src0);
+
+    int64_t iter;  memcpy(&iter,  &dst->op_params[0], sizeof(int64_t));
+    float   alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float));
+    float   beta1; memcpy(&beta1, &dst->op_params[3], sizeof(float));
+    float   beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float));
+    float   eps;   memcpy(&eps,   &dst->op_params[5], sizeof(float));
+    float   wd;    memcpy(&wd,    &dst->op_params[6], sizeof(float));
+
+    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
+    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
+
+    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
+
+    iter++;
+    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
+}
diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cuh b/ggml/src/ggml-cuda/opt-step-adamw.cuh
new file mode 100644
index 000000000..58d6f6e5d
--- /dev/null
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256
+
+void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
new file mode 100644
index 000000000..657d50e15
--- /dev/null
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -0,0 +1,52 @@
+#include "out-prod.cuh"
+#include "vendors/cuda.h"
+
+#include <cstdint>
+
+void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(ne01 == ne11);
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+
+    GGML_ASSERT(ne2 == src0->ne[2]);
+    GGML_ASSERT(ne2 == src1->ne[2]);
+    GGML_ASSERT(ne3 == src0->ne[3]);
+    GGML_ASSERT(ne3 == src1->ne[3]);
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       *  dst_d = (float       *)  dst->data;
+
+    cudaStream_t   stream = ctx.stream();
+    cublasHandle_t handle = ctx.cublas_handle();
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    GGML_ASSERT(ne2 == 1);
+    GGML_ASSERT(ne3 == 1);
+    CUBLAS_CHECK(cublasSetStream(handle, stream));
+
+    const bool src1_T = ggml_is_transposed(src1);
+    const cublasOperation_t src1_cublas_op =  src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
+    const int64_t           ldb            = (src1_T ?        nb10 :        nb11) /  sizeof(float);
+    GGML_ASSERT(                             (src1_T ?        nb11 :        nb10) == sizeof(float));
+
+    CUBLAS_CHECK(
+        cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                ne0, ne1, ne01,
+                &alpha, src0_d, ne00,
+                        src1_d, ldb,
+                &beta,  dst_d,  ne0));
+}
diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh
new file mode 100644
index 000000000..a0046f5f8
--- /dev/null
+++ b/ggml/src/ggml-cuda/out-prod.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 8ac669f94..163b5a8ff 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -10,6 +10,16 @@ static __global__ void neg_f32(const float * x, float * dst, const int k) {
     dst[i] = -x[i];
 }
 
+static __global__ void step_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] > 0.0f;
+}
+
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
     const float GELU_COEF_A    = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -134,6 +144,11 @@ static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
+    step_f32<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -213,6 +228,20 @@ void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 
+void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index ed2ffc461..fe519f6a2 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -1,6 +1,7 @@
 #include "common.cuh"
 
 #define CUDA_NEG_BLOCK_SIZE 256
+#define CUDA_STEP_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256
@@ -15,6 +16,8 @@
 
 void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 7f0bd82d5..9cbc57a64 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -1872,6 +1872,7 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
     /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
     /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_kompute_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_kompute_buffer_get_tensor,
     /* .cpy_tensor      = */ NULL,
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index f87181d19..ef3b7f0e8 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3167,6 +3167,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index a8a2eb85a..49b3fa911 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -469,6 +469,7 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_rpc_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_rpc_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_rpc_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_rpc_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_rpc_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_rpc_buffer_cpy_tensor,
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index acef7c6d4..2cf445047 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -4323,6 +4323,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_sycl_buffer_cpy_tensor,
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index bad960510..f9da45881 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -6246,6 +6246,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_vk_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_vk_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_vk_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_vk_buffer_cpy_tensor,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 51532c5ea..201d5466a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1,6 +1,7 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
+#include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
@@ -2997,9 +2998,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
+    "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79");
+static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3090,9 +3092,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
+    "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79");
+static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4094,7 +4097,11 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa
 }
 
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
-    memset(tensor->data, 0, ggml_nbytes(tensor));
+    if (tensor->buffer) {
+        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
+    } else {
+        memset(tensor->data, 0, ggml_nbytes(tensor));
+    }
     return tensor;
 }
 
@@ -8320,11 +8327,46 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
     return result;
 }
 
+// opt_step_adamw
+
+struct ggml_tensor * ggml_opt_step_adamw(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 alpha,
+        float                 beta1,
+        float                 beta2,
+        float                 eps,
+        float                 wd) {
+    GGML_ASSERT(a->grad);
+    GGML_ASSERT(alpha >  0.0f);
+    GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f);
+    GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f);
+    GGML_ASSERT(eps   >= 0.0f);
+    GGML_ASSERT(wd    >= 0.0f && wd    <= 1.0f);
+
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    result->op   = GGML_OP_OPT_STEP_ADAMW;
+    result->grad = NULL;
+    result->src[0] = a;
+    result->src[1] = a->grad;
+    result->src[2] = ggml_dup_tensor(ctx, a->grad);
+    result->src[3] = ggml_dup_tensor(ctx, a->grad);
+
+    const int64_t iter = 1;
+    memcpy(&result->op_params[0], &iter, sizeof(int64_t));
+    ggml_set_op_params_f32(result, 2, alpha);
+    ggml_set_op_params_f32(result, 3, beta1);
+    ggml_set_op_params_f32(result, 4, beta2);
+    ggml_set_op_params_f32(result, 5, eps);
+    ggml_set_op_params_f32(result, 6, wd);
+
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_set_param(
-        struct ggml_context * ctx,
-        struct ggml_tensor * tensor) {
+void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 
     GGML_ASSERT(tensor->grad == NULL);
@@ -8332,6 +8374,13 @@ void ggml_set_param(
     ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }
 
+void ggml_set_loss(struct ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_is_scalar(tensor));
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
+    GGML_ASSERT(tensor->grad);
+    tensor->flags |= GGML_TENSOR_FLAG_LOSS;
+}
+
 // ggml_compute_forward_dup
 
 static void ggml_compute_forward_dup_same_cont(
@@ -17406,7 +17455,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
     const int64_t ir0 = dr*ith;
     const int64_t ir1 = MIN(ir0 + dr, nr);
 
-    float * d   = (float *) opt0->data;
+    const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
 
     for (int64_t i1 = ir0; i1 < ir1; i1++) {
         float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
@@ -17430,7 +17479,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 
         // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
         ggml_vec_sub_f32(nc, ds0, ds0, s1);
-        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+        ggml_vec_scale_f32(nc, ds0, d_by_nr);
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -17459,6 +17508,94 @@ static void ggml_compute_forward_cross_entropy_loss_back(
     }
 }
 
+static void ggml_compute_forward_opt_step_adamw_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0        = dst->src[0];
+    const struct ggml_tensor * src0_grad   = dst->src[1];
+    const struct ggml_tensor * src0_grad_m = dst->src[2];
+    const struct ggml_tensor * src0_grad_v = dst->src[3];
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    /* const float   gnorm = 1.0f; */
+    int64_t       iter;   memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
+    const float   alpha = ggml_get_op_params_f32(dst, 2);
+    const float   beta1 = ggml_get_op_params_f32(dst, 3);
+    const float   beta2 = ggml_get_op_params_f32(dst, 4);
+    const float   eps   = ggml_get_op_params_f32(dst, 5);
+    const float   wd    = ggml_get_op_params_f32(dst, 6);
+
+    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
+    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const size_t offset = i03*nb03 + i02*nb02 + i01*nb01;
+
+        float       * w = (float       *) ((char       *) src0->data        + offset); // weight
+        const float * g = (const float *) ((const char *) src0_grad->data   + offset); // grad
+        float       * m = (float       *) ((char       *) src0_grad_m->data + offset);
+        float       * v = (float       *) ((char       *) src0_grad_v->data + offset);
+
+        for (int i00 = 0; i00 < ne00; ++i00) {
+            m[i00] = m[i00]*beta1 +        g[i00]*(1.0f - beta1);
+            v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
+
+            const float mh =       m[i00]*beta1h;
+            const float vh = sqrtf(v[i00]*beta2h) + eps;
+
+            // The weight decay is applied independently of the Adam momenta m and v.
+            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
+            // See: https://arxiv.org/pdf/1711.05101v3.pdf
+            w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+    if (ith != 0) {
+        return;
+    }
+
+    iter++;
+    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
+}
+
+static void ggml_compute_forward_opt_step_adamw(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_opt_step_adamw_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
 /////////////////////////////////
 
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
@@ -17804,6 +17941,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_cross_entropy_loss_back(params, tensor);
             }
             break;
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                ggml_compute_forward_opt_step_adamw(params, tensor);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -17958,7 +18100,7 @@ void ggml_build_backward_gradient_checkpointing(
         struct ggml_tensor  * * checkpoints,
         int                     n_checkpoints) {
     ggml_graph_cpy(gf, gb_tmp);
-    ggml_build_backward_expand(ctx, gf, gb_tmp, true);
+    ggml_build_backward_expand(ctx, gf, gb_tmp, false, true);
 
     if (n_checkpoints <= 0) {
         ggml_graph_cpy(gb_tmp, gb);
@@ -17996,42 +18138,93 @@ void ggml_build_backward_gradient_checkpointing(
     ggml_hash_map_free(replacements);
 }
 
-// functions to change gradients considering the case that input a might be initial gradient with zero value
+// utility functions to change gradients
+// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
+// else if a is in zero_table, replace a
+// else, just add/subtract/etc. the gradients
 
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
+static struct ggml_tensor * ggml_add_or_set(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        struct ggml_tensor   * b,
+        struct ggml_hash_set * zero_table,
+        struct ggml_hash_set * acc_table) {
+    if (ggml_hash_contains(acc_table, a)) {
+        struct ggml_tensor * ret = ggml_add_impl(ctx, a, b, true);
+        const size_t insert_result = ggml_hash_insert(acc_table, ret);
+        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+        return ret;
+    }
     if (ggml_hash_contains(zero_table, a)) {
         return b;
-    } else {
-        return ggml_add_impl(ctx, a, b, false);
     }
+    return ggml_add_impl(ctx, a, b, false);
 }
 
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) {
+static struct ggml_tensor * ggml_acc_or_set(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        struct ggml_tensor   * b,
+        const  size_t          nb1,
+        const  size_t          nb2,
+        const  size_t          nb3,
+        const  size_t          offset,
+        struct ggml_hash_set * zero_table,
+        struct ggml_hash_set * acc_table) {
+    if (ggml_hash_contains(acc_table, a)) {
+        struct ggml_tensor * ret = ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+        const size_t insert_result = ggml_hash_insert(acc_table, ret);
+        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+        return ret;
+    }
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
-    } else {
-        return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
     }
+    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
 }
 
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
+static struct ggml_tensor * ggml_add1_or_set(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        struct ggml_tensor   * b,
+        struct ggml_hash_set * zero_table,
+        struct ggml_hash_set * acc_table) {
+    if (ggml_hash_contains(acc_table, a)) {
+        struct ggml_tensor * ret = ggml_add1_impl(ctx, a, b, true);
+        const size_t insert_result = ggml_hash_insert(acc_table, ret);
+        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+        return ret;
+    }
     if (ggml_hash_contains(zero_table, a)) {
         return ggml_repeat(ctx, b, a);
-    } else {
-        return ggml_add1_impl(ctx, a, b, false);
     }
+    return ggml_add1_impl(ctx, a, b, false);
 }
 
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
+static struct ggml_tensor * ggml_sub_or_set(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        struct ggml_tensor   * b,
+        struct ggml_hash_set * zero_table,
+        struct ggml_hash_set * acc_table) {
+    if (ggml_hash_contains(acc_table, a)) {
+        struct ggml_tensor * ret = ggml_sub_impl(ctx, a, b, true);
+        const size_t insert_result = ggml_hash_insert(acc_table, ret);
+        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+        return ret;
+    }
     if (ggml_hash_contains(zero_table, a)) {
         return ggml_neg(ctx, b);
-    } else {
-        return ggml_sub_impl(ctx, a, b, false);
     }
+    return ggml_sub_impl(ctx, a, b, false);
 }
 
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table, struct ggml_hash_set * acc_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
@@ -18040,38 +18233,38 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_DUP:
             {
                 if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
             } break;
         case GGML_OP_ADD:
             {
                 if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
                 if (src1->grad) {
                     if (ggml_are_same_shape(src0, src1)) {
-                        src1->grad = ggml_add_or_set(ctx, src1->grad,                       tensor->grad,        zero_table);
+                        src1->grad = ggml_add_or_set(ctx, src1->grad,                       tensor->grad,        zero_table, acc_table);
                     } else {
-                        src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table);
+                        src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table, acc_table);
                     }
                 }
             } break;
         case GGML_OP_ADD1:
             {
                 if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
                 if (src1->grad) {
                     src1->grad = ggml_add_or_set(ctx,
                         src1->grad,
                         ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
-                        zero_table);
+                        zero_table, acc_table);
                 }
             } break;
         case GGML_OP_ACC:
             {
                 if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
                 if (src1->grad) {
                     const size_t nb1     = ((int32_t *) tensor->op_params)[0];
@@ -18093,16 +18286,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_reshape(ctx,
                                 ggml_cont(ctx, tensor_grad_view),
                                 src1->grad),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SUB:
             {
                 if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
                 if (src1->grad) {
-                    src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table);
+                    src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table, acc_table);
                 }
             } break;
         case GGML_OP_MUL:
@@ -18112,14 +18305,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_add_or_set(ctx,
                                 src0->grad,
                                 ggml_mul(ctx, src1, tensor->grad),
-                                zero_table);
+                                zero_table, acc_table);
                 }
                 if (src1->grad) {
                     src1->grad =
                         ggml_add_or_set(ctx,
                                 src1->grad,
                                 ggml_mul(ctx, src0, tensor->grad),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_DIV:
@@ -18129,7 +18322,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_add_or_set(ctx,
                                 src0->grad,
                                 ggml_div(ctx, tensor->grad, src1),
-                                zero_table);
+                                zero_table, acc_table);
                 }
                 if (src1->grad) {
                     src1->grad =
@@ -18138,7 +18331,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_mul(ctx,
                                     tensor->grad,
                                     ggml_div(ctx, tensor, src1)),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SQR:
@@ -18150,7 +18343,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_scale(ctx,
                                     ggml_mul(ctx, src0, tensor->grad),
                                     2.0f),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SQRT:
@@ -18164,7 +18357,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                         tensor->grad,
                                         tensor),
                                     0.5f),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_LOG:
@@ -18176,7 +18369,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_div(ctx,
                                     tensor->grad,
                                     src0),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SIN:
@@ -18188,7 +18381,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_mul(ctx,
                                     tensor->grad,
                                     ggml_cos(ctx, src0)),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_COS:
@@ -18200,7 +18393,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_mul(ctx,
                                     tensor->grad,
                                     ggml_sin(ctx, src0)),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SUM:
@@ -18210,7 +18403,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_add1_or_set(ctx,
                                 src0->grad,
                                 tensor->grad,
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SUM_ROWS:
@@ -18222,7 +18415,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_repeat(ctx,
                                     tensor->grad,
                                     src0->grad),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_MEAN:
@@ -18237,7 +18430,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
                             ggml_repeat_back(ctx, tensor->grad, src0->grad),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_REPEAT_BACK:
@@ -18247,7 +18440,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
                             ggml_repeat(ctx, tensor->grad, src0->grad),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_CONCAT:
@@ -18272,7 +18465,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
                             ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_RMS_NORM_BACK:
@@ -18320,7 +18513,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_add_or_set(ctx,
                                 src0->grad, // [n,m,q1,r1]
                                 s1_tg,      // [n,m,q1,r1]
-                                zero_table);
+                                zero_table, acc_table);
                 }
                 if (src1->grad) {
                     src1->grad =
@@ -18338,7 +18531,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                     src0,                           // [n,m,q1,r1]
                                     ggml_transpose(ctx,             // [p,m,qq,rr]
                                         tensor->grad)),             // [m,p,qq,rr]
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_MUL_MAT_ID:
@@ -18360,7 +18553,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_add_or_set(ctx,
                             src0->grad,
                             ggml_scale_impl(ctx, tensor->grad, s, false),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SET:
@@ -18389,7 +18582,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             tensor->grad,
                             ggml_neg(ctx, tensor_grad_view),
                             nb1, nb2, nb3, offset, false),
-                        zero_table);
+                        zero_table, acc_table);
                 }
 
                 if (src1->grad) {
@@ -18399,7 +18592,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_reshape(ctx,
                                 ggml_cont(ctx, tensor_grad_view),
                                 src1->grad),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_CPY:
@@ -18410,7 +18603,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // tensor = src0 * 1 + src1 * 0
                 if (src0->grad) {
                     // dsrc0 = dtensor * 1
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
                 if (src1->grad) {
                     // dsrc1 = dtensor * 0 -> noop
@@ -18422,7 +18615,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 if (src0->grad) {
                     GGML_ASSERT(ggml_is_contiguous(src0->grad));
                     GGML_ASSERT(ggml_is_contiguous(tensor->grad));
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                 }
             } break;
         case GGML_OP_RESHAPE:
@@ -18436,7 +18629,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                     ? tensor->grad
                                     : ggml_cont(ctx, tensor->grad),
                                 src0->grad),
-                        zero_table);
+                        zero_table, acc_table);
                 }
             } break;
         case GGML_OP_VIEW:
@@ -18465,7 +18658,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         nb3 = (nb3 / n0) * ng;
                     }
 
-                    src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table);
+                    src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table, acc_table);
                 }
             } break;
         case GGML_OP_PERMUTE:
@@ -18490,7 +18683,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 axes_backward[1],
                                 axes_backward[2],
                                 axes_backward[3]),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_TRANSPOSE:
@@ -18500,7 +18693,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad =
                         ggml_add_or_set(ctx, src0->grad,
                             ggml_transpose(ctx, tensor->grad),
-                        zero_table);
+                        zero_table, acc_table);
                 }
             } break;
         case GGML_OP_GET_ROWS:
@@ -18512,7 +18705,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             // last ggml_get_rows_back argument src0->grad is only
                             // necessary to setup correct output shape
                             ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
-                        zero_table);
+                        zero_table, acc_table);
                 }
                 if (src1->grad) {
                     // noop
@@ -18536,7 +18729,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             /* ggml_diag_mask_inf_impl() shouldn't be here */
                             /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                             ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
+                        zero_table, acc_table);
                 }
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
@@ -18547,7 +18740,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad =
                         ggml_add_or_set(ctx, src0->grad,
                             ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
+                        zero_table, acc_table);
                 }
             } break;
         case GGML_OP_SOFT_MAX:
@@ -18557,7 +18750,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad =
                         ggml_add_or_set(ctx, src0->grad,
                             ggml_soft_max_back(ctx, tensor->grad, tensor),
-                        zero_table);
+                        zero_table, acc_table);
                 }
 
             } break;
@@ -18598,7 +18791,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 attn_factor,
                                 beta_fast,
                                 beta_slow),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_ROPE_BACK:
@@ -18634,7 +18827,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 beta_fast,
                                 beta_slow,
                                 false),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_CLAMP:
@@ -18659,7 +18852,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src1->grad = ggml_add_or_set(ctx,
                             src1->grad,
                             ggml_im2col_back(ctx, src0, tensor->grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_IM2COL_BACK:
@@ -18688,7 +18881,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
                             ggml_pool_2d_back(ctx, tensor->grad, src0, op, k0, k1, s0, s1, p0, p1),
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_POOL_2D_BACK:
@@ -18753,7 +18946,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
                             grad_q,
-                            zero_table);
+                            zero_table, acc_table);
                 }
                 if (src1->grad) {
                     struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
@@ -18761,7 +18954,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src1->grad = ggml_add_or_set(ctx,
                             src1->grad,
                             grad_k,
-                            zero_table);
+                            zero_table, acc_table);
                 }
                 if (src2->grad) {
                     struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
@@ -18769,7 +18962,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src2->grad = ggml_add_or_set(ctx,
                             src2->grad,
                             grad_v,
-                            zero_table);
+                            zero_table, acc_table);
                 }
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
@@ -18795,7 +18988,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                             ggml_mul(ctx,
                                                 ggml_sgn(ctx, src0),
                                                 tensor->grad),
-                                            zero_table);
+                                            zero_table, acc_table);
                             }
                         } break;
                     case GGML_UNARY_OP_SGN:
@@ -18807,7 +19000,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     case GGML_UNARY_OP_NEG:
                         {
                             if (src0->grad) {
-                                src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table);
+                                src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
                             }
                         } break;
                     case GGML_UNARY_OP_STEP:
@@ -18832,7 +19025,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                         ggml_mul(ctx,
                                             ggml_step(ctx, src0),
                                             tensor->grad),
-                                        zero_table);
+                                        zero_table, acc_table);
                             }
                         } break;
                     case GGML_UNARY_OP_SIGMOID:
@@ -18854,7 +19047,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 src0->grad = ggml_add_or_set(ctx,
                                         src0->grad,
                                         ggml_silu_back(ctx, src0, tensor->grad),
-                                        zero_table);
+                                        zero_table, acc_table);
                             }
                         } break;
                     case GGML_UNARY_OP_EXP:
@@ -18863,7 +19056,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 src0->grad = ggml_add_or_set(ctx,
                                         src0->grad,
                                         ggml_mul(ctx, tensor, tensor->grad),
-                                        zero_table);
+                                        zero_table, acc_table);
                             }
                         } break;
                     default:
@@ -18893,13 +19086,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                     src0,
                                     src1,
                                     tensor->grad),
-                                zero_table);
+                                zero_table, acc_table);
                 }
             } break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
                 GGML_ABORT("fatal error"); // not supported
             }
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                GGML_ABORT("fatal error"); // not supported
+            }
         case GGML_OP_NONE:
             {
                 // nop
@@ -18989,7 +19186,7 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
     GGML_ASSERT(gf->grads);
 
@@ -19005,21 +19202,35 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         }
     }
 
-    // remember original gradients which start with zero values
+    // keep tables of original gradients for replacement/accumulation logic
     struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
+    struct ggml_hash_set acc_table  = ggml_hash_set_new(gf->size);
     for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->grads[i]) {
-            ggml_hash_insert(&zero_table, gf->grads[i]);
+        struct ggml_tensor * node = gf->nodes[i];
+
+        if (node->grad) {
+            {
+                const size_t insert_result = ggml_hash_insert(&zero_table, node->grad);
+                GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+                GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            }
+
+            // only gradients of trainable parameters should be accumulated
+            if (accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
+                const size_t insert_result = ggml_hash_insert(&acc_table, node->grad);
+                GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+                GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            }
         }
     }
 
     for (int i = gf->n_nodes - 1; i >= 0; i--) {
         struct ggml_tensor * node = gf->nodes[i];
 
-        // inplace operations to add gradients are not created by ggml_compute_backward
+        // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
         // use allocator to automatically make inplace operations
         if (node->grad) {
-            ggml_compute_backward(ctx, node, &zero_table);
+            ggml_compute_backward(ctx, node, &zero_table, &acc_table);
         }
     }
 
@@ -19033,8 +19244,30 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     }
 
     ggml_hash_set_free(&zero_table);
+    ggml_hash_set_free(&acc_table);
 }
 
+void ggml_build_opt_adamw(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * gf,
+        struct ggml_cgraph  * gb,
+        float                 alpha,
+        float                 beta1,
+        float                 beta2,
+        float                 eps,
+        float                 wd) {
+    for (int i = 0; i < gf->n_nodes; i++) {
+        struct ggml_tensor * node = gf->nodes[i];
+
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, alpha, beta1, beta2, eps, wd);
+            ggml_build_forward_expand(gb, opt_step);
+        }
+    }
+}
+
+
 static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
     void * ptr = *p;
     ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
@@ -19162,10 +19395,28 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     GGML_ASSERT(cgraph->grads != NULL);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * grad = cgraph->grads[i];
+        struct ggml_tensor * node = cgraph->nodes[i];
 
-        if (grad) {
-            ggml_set_zero(grad);
+        // initial gradients of loss should be 1, 0 otherwise
+        if (node->grad) {
+            if (node->flags & GGML_TENSOR_FLAG_LOSS) {
+                GGML_ASSERT(node->grad->buffer);
+                GGML_ASSERT(node->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_is_scalar(node));
+
+                const float onef = 1.0f;
+                ggml_backend_tensor_set(node->grad, &onef, 0, ggml_nbytes(node->grad));
+            } else {
+                ggml_set_zero(node->grad);
+            }
+        }
+
+        GGML_ASSERT(node);
+        if (node->op == GGML_OP_OPT_STEP_ADAMW) {
+            // set iteration to 1 and clear momenta
+            ggml_set_op_params_i32(node, 0, 1);
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
         }
     }
 }
@@ -19458,6 +19709,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+        case GGML_OP_OPT_STEP_ADAMW:
             {
                 n_tasks = n_threads;
             } break;
@@ -21851,7 +22103,7 @@ enum ggml_opt_result ggml_opt_resume(
     ggml_build_forward_expand(gf, f);
 
     struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
-    ggml_build_backward_expand(ctx, gf, gb, true);
+    ggml_build_backward_expand(ctx, gf, gb, false, true);
 
     return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index aa7896def..889a19944 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -799,10 +799,11 @@ struct test_case {
             out = ggml_sum(ctx, out);
             ggml_set_name(out, "sum_of_out");
         }
+        ggml_set_loss(out);
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false);
+        ggml_build_backward_expand(ctx, gf, gb, false, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -837,22 +838,11 @@ struct test_case {
             return false;
         }
 
-        // randomize tensors
-        initialize_tensors(ctx);
 
-        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-            if (!t->grad) {
-                continue;
-            }
+        initialize_tensors(ctx); // Randomizes all tensors (including gradients).
+        ggml_graph_reset(gb);    // Sets gradients to 1 if loss, 0 otherwise.
 
-            std::vector<float> tmp(ggml_nelements(t->grad));
-            ggml_backend_tensor_set(t->grad, tmp.data(), 0, ggml_nbytes(t->grad));
-        }
-
-        // build graphs
-        const float onef = 1.0f;
         ggml_backend_graph_compute(backend, gf);
-        ggml_backend_tensor_set(out->grad, &onef, 0, ggml_nbytes(out->grad));
         ggml_backend_graph_compute(backend, gb);
 
         bool ok = true;
@@ -1681,6 +1671,50 @@ struct test_mul_mat_id : public test_case {
     }
 };
 
+// GGML_OP_OUT_PROD
+struct test_out_prod : public test_case {
+    const ggml_type type_a;
+    const ggml_type type_b;
+    const int64_t m;
+    const int64_t n;
+    const int64_t k;
+    const std::array<int64_t, 2> bs; // dims 3 and 4
+    const bool trans_b;
+
+    std::string vars() override {
+        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32,
+            std::array<int64_t, 2> bs = {10, 10},
+            bool trans_b = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b;
+        if (trans_b) {
+            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
+            b = ggml_transpose(ctx, b);
+        } else {
+            b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
+        }
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_out_prod(ctx, a, b);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_SQR
 struct test_sqr : public test_case {
     const ggml_type type;
@@ -2666,6 +2700,51 @@ struct test_cross_entropy_loss : public test_case {
     }
 };
 
+// GGML_OP_OPT_STEP_ADAMW
+struct test_opt_step_adamw : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float alpha;
+    const float beta1;
+    const float beta2;
+    const float eps;
+    const float wd;
+
+    std::string vars() override {
+        return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
+    }
+
+    test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            float alpha = 1e-3f,
+            float beta1 = 0.9f,
+            float beta2 = 0.999f,
+            float eps = 1e-8f,
+            float wd = 0.0f)
+        : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
+        }
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
@@ -3159,14 +3238,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, 3}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, 3}, {1, 1, 1, 2}));
+    for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+    }
 
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
@@ -3350,6 +3430,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
+    for (ggml_type type_a : base_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1,  1}, true));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+        }
+    }
+
     test_cases.emplace_back(new test_sqr());
     test_cases.emplace_back(new test_sqrt());
     test_cases.emplace_back(new test_log());
@@ -3476,6 +3577,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     test_cases.emplace_back(new test_cross_entropy_loss());
+    for (float wd : {0.0f, 1e-2f}) {
+        test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
+    }
 
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 1834c11d8..2ef606d2c 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -240,7 +240,7 @@ static bool check_gradient(
     struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
     ggml_build_forward_expand(gf, f);
     ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false);
+    ggml_build_backward_expand(ctx0, gf, gb, false, false);
 
     ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 

From 43015353262de835215103475055b74045cb9f49 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Sep 2024 19:06:59 +0300
Subject: [PATCH 10/78] sync : ggml

ggml-ci
---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 3d2dfb413..cf7b97d45 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-10e83a412717c20d57ba19f025248e18e43addf3
+e7b23907cb2816e9951fe9b524d7127ab777297a

From 27609c49b94766a136764ce7919c8a082bf71548 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Sep 2024 19:13:02 +0300
Subject: [PATCH 11/78] ggml : fix trailing whitespace (#0)

ggml-ci
---
 ggml/src/ggml-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 97ca5a1f3..ba280e064 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -256,7 +256,7 @@ GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor,
     if (!size) {
         return;
     }
-    
+
     GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);

From d13edb17ed1ce3b961016cbdb616b1c8d161c026 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Sep 2024 20:12:52 +0300
Subject: [PATCH 12/78] ggml : fix builds (#0)

ggml-ci
---
 ggml/src/ggml-cuda/out-prod.cu   | 1 -
 ggml/src/ggml-cuda/vendors/hip.h | 1 +
 ggml/src/ggml-sycl.cpp           | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
index 657d50e15..619cfdcb5 100644
--- a/ggml/src/ggml-cuda/out-prod.cu
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -1,5 +1,4 @@
 #include "out-prod.cuh"
-#include "vendors/cuda.h"
 
 #include <cstdint>
 
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index d0c377255..1f3c70c2e 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -30,6 +30,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cublasOperation_t hipblasOperation_t
 #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 2cf445047..16e6be4a0 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -4735,6 +4735,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_sycl_split_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_sycl_split_buffer_get_tensor,
     /* .cpy_tensor      = */ NULL,

From 63351143b2ea5efe9f8b9c61f553af8a51f1deff Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 20 Sep 2024 20:55:36 +0200
Subject: [PATCH 13/78] quantize : improve type name parsing (#9570)

quantize : do not ignore invalid types in arg parsing

quantize : ignore case of type and ftype arguments
---
 examples/quantize/quantize.cpp | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index a23bfb86b..b98993210 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -63,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";
 
+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
 
@@ -70,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name == ftype_str) {
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;
@@ -225,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }
 
 static ggml_type parse_ggml_type(const char * arg) {
-    ggml_type result = GGML_TYPE_COUNT;
-    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
-        auto type = ggml_type(j);
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
-            result = type; break;
+        if (name && striequals(name, arg)) {
+            return type;
         }
     }
-    return result;
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }
 
 int main(int argc, char ** argv) {
@@ -254,12 +264,18 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
         } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
             if (arg_idx < argc-1) {
                 params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }

From e948a7da7af7f2dbfdcd695b3ba903ab12575f78 Mon Sep 17 00:00:00 2001
From: Huang Qi <huangqi3@xiaomi.com>
Date: Sat, 21 Sep 2024 08:39:41 +0800
Subject: [PATCH 14/78] CI: Provide prebuilt windows binary for hip (#9467)

---
 .github/workflows/build.yml | 70 +++++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1777489ec..a54c5de99 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -967,6 +967,7 @@ jobs:
           name: llama-bin-win-sycl-x64.zip
 
   windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
     runs-on: windows-latest
 
     steps:
@@ -994,8 +995,72 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
-          cmake --build build --config Release
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  windows-latest-cmake-hip-release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        gpu_target: [gfx1100, gfx1101, gfx1030]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
 
   ios-xcode-build:
     runs-on: macos-latest
@@ -1060,6 +1125,7 @@ jobs:
       - macOS-latest-cmake
       - windows-latest-cmake
       - windows-latest-cmake-cuda
+      - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64
 

From 41f477879fd5ccc31211634292e8e293ec700e85 Mon Sep 17 00:00:00 2001
From: agray3 <agray3@users.noreply.github.com>
Date: Sat, 21 Sep 2024 01:41:07 +0100
Subject: [PATCH 15/78] Update CUDA graph on scale change plus clear
 nodes/params  (#9550)

* Avoid using saved CUDA graph if scale changes and reset nodes/params on update

Fixes https://github.com/ggerganov/llama.cpp/issues/9451

* clear before resize
---
 ggml/src/ggml-cuda.cu         | 9 +++++++++
 ggml/src/ggml-cuda/common.cuh | 1 +
 2 files changed, 10 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index b0843dc62..895ba4794 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2478,6 +2478,7 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
     }
+    memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
 }
 
 static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
@@ -2509,6 +2510,12 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
             return false;
         }
     }
+
+    if (node->op == GGML_OP_SCALE &&
+        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
+        return false;
+    }
+
     return true;
 }
 
@@ -2720,7 +2727,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // First call with null argument gets number of nodes in graph
             CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
             // Subsequent call with non-null argument gets nodes
+            cuda_ctx->cuda_graph->nodes.clear();
             cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+            cuda_ctx->cuda_graph->params.clear();
             cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
             if (cuda_ctx->cuda_graph->num_nodes > 0) {
                 CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index eb39b6d23..85eb200f0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -569,6 +569,7 @@ struct ggml_graph_node_properties {
     int64_t ne[GGML_MAX_DIMS];
     size_t nb[GGML_MAX_DIMS];
     void * src_address[GGML_MAX_SRC];
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 };
 
 struct ggml_cuda_graph {

From d09770cae71b416c032ec143dda530f7413c4038 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 21 Sep 2024 14:24:23 +0200
Subject: [PATCH 16/78] ggml-alloc : fix list of allocated tensors with
 GGML_ALLOCATOR_DEBUG (#9573)

---
 ggml/src/ggml-alloc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e485326ab..70187b9b6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     alloc->max_size = 0;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    for (int i = 0; i < 1024; i++) {
+        alloc->allocated_tensors[i].tensor = NULL;
+    }
+#endif
 }
 
 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {

From 2a63caaa69fad6c2afddc47bae052cf2afb01529 Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Sun, 22 Sep 2024 10:29:12 +0800
Subject: [PATCH 17/78] RWKV v6: RWKV_WKV op CUDA implementation (#9454)

* ggml: CUDA unary op EXP

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* ggml: rwkv_wkv op CUDA impl

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
---
 ggml/src/ggml-cuda.cu           |  8 +++
 ggml/src/ggml-cuda/rwkv-wkv.cu  | 89 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/rwkv-wkv.cuh |  5 ++
 ggml/src/ggml-cuda/unary.cu     | 28 +++++++++++
 ggml/src/ggml-cuda/unary.cuh    |  3 ++
 tests/test-backend-ops.cpp      | 35 +++++++++++++
 6 files changed, 168 insertions(+)
 create mode 100644 ggml/src/ggml-cuda/rwkv-wkv.cu
 create mode 100644 ggml/src/ggml-cuda/rwkv-wkv.cuh

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 895ba4794..f94051198 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -34,6 +34,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/rwkv-wkv.cuh"
 
 #include <algorithm>
 #include <array>
@@ -2243,6 +2244,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_HARDSWISH:
                     ggml_cuda_op_hardswish(ctx, dst);
                     break;
+                case GGML_UNARY_OP_EXP:
+                    ggml_cuda_op_exp(ctx, dst);
+                    break;
                 default:
                     return false;
             }
@@ -2345,6 +2349,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
+        case GGML_OP_RWKV_WKV:
+            ggml_cuda_op_rwkv_wkv(ctx, dst);
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             ggml_cuda_cross_entropy_loss_back(ctx, dst);
             break;
@@ -2806,6 +2812,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_EXP:
                     return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
@@ -2967,6 +2974,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
+        case GGML_OP_RWKV_WKV:
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cu b/ggml/src/ggml-cuda/rwkv-wkv.cu
new file mode 100644
index 000000000..098e92d35
--- /dev/null
+++ b/ggml/src/ggml-cuda/rwkv-wkv.cu
@@ -0,0 +1,89 @@
+#include "common.cuh"
+#include "rwkv-wkv.cuh"
+
+static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    const int head_size = CUDA_WKV_BLOCK_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    float state[head_size];
+    __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size];
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    __syncthreads();
+    _tf[tid] = tf[head_i * head_size + tid];
+    __syncthreads();
+
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
+        __syncthreads();
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        __syncthreads();
+
+        const float _v = v[t];
+        float y = 0;
+        for (int j = 0; j < head_size; j += 4) {
+            const float4& k = (float4&)(_k[j]);
+            const float4& r = (float4&)(_r[j]);
+            const float4& tf = (float4&)(_tf[j]);
+            const float4& td = (float4&)(_td[j]);
+            float4& s = (float4&)(state[j]);
+            float4 kv;
+
+            kv.x = k.x * _v;
+            kv.y = k.y * _v;
+            kv.z = k.z * _v;
+            kv.w = k.w * _v;
+
+            y += r.x * (tf.x * kv.x + s.x);
+            y += r.y * (tf.y * kv.y + s.y);
+            y += r.z * (tf.z * kv.z + s.z);
+            y += r.w * (tf.w * kv.w + s.w);
+
+            s.x = s.x * td.x + kv.x;
+            s.y = s.y * td.y + kv.y;
+            s.z = s.z * td.z + kv.z;
+            s.w = s.w * td.w + kv.w;
+        }
+        dst[t] = y;
+    }
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const float * k_d  = (const float *)dst->src[0]->data;
+    const float * v_d  = (const float *)dst->src[1]->data;
+    const float * r_d  = (const float *)dst->src[2]->data;
+    const float * tf_d = (const float *)dst->src[3]->data;
+    const float * td_d = (const float *)dst->src[4]->data;
+    const float * s_d  = (const float *)dst->src[5]->data;
+
+    const int64_t B = dst->src[5]->ne[1];
+    const int64_t T = dst->src[0]->ne[3];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[2];
+
+    float * dst_d = (float *)dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE);
+
+    rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
+}
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cuh b/ggml/src/ggml-cuda/rwkv-wkv.cuh
new file mode 100644
index 000000000..13795247f
--- /dev/null
+++ b/ggml/src/ggml-cuda/rwkv-wkv.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_WKV_BLOCK_SIZE 64
+
+void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 163b5a8ff..81fc92202 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -95,6 +95,15 @@ static __global__ void hardswish_f32(const float * x, float * dst, const int k)
     dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 }
 
+static __global__ void exp_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = expf(x[i]);
+}
+
 static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
     const int i  = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
@@ -189,6 +198,11 @@ static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaSt
     hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
+    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
@@ -354,6 +368,20 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index fe519f6a2..c91936728 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -8,6 +8,7 @@
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SIGMOID_BLOCK_SIZE 256
 #define CUDA_HARDSIGMOID_BLOCK_SIZE 256
+#define CUDA_EXP_BLOCK_SIZE 256
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_SQRT_BLOCK_SIZE 256
@@ -32,6 +33,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 889a19944..efa88688c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1543,6 +1543,36 @@ struct test_ssm_scan : public test_case {
     }
 };
 
+// GGML_OP_RWKV_WKV
+struct test_rwkv_wkv : public test_case {
+    const ggml_type type;
+
+    const int64_t head_count;
+    const int64_t head_size;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
+    }
+
+    test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
+            int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_tokens = n_seq_tokens * n_seqs;
+        ggml_tensor * r   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
+        ggml_tensor * k   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ head_size, 1, head_count, n_tokens }.data());
+        ggml_tensor * v   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
+        ggml_tensor * tf  = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
+        ggml_tensor * td  = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
+        ggml_tensor * s   = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
+        return out;
+    }
+};
+
 // GGML_OP_MUL_MAT
 struct test_mul_mat : public test_case {
     const ggml_type type_a;
@@ -3337,6 +3367,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
 
+    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
+
 #if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {

From ecd5d6b65be08927e62de1587d5fd22778cdc250 Mon Sep 17 00:00:00 2001
From: Shankar <gshankar.87@gmail.com>
Date: Sat, 21 Sep 2024 19:30:34 -0700
Subject: [PATCH 18/78] llama: remove redundant loop when constructing ubatch
 (#9574)

---
 src/llama.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index af8afd845..bc4e408e0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3056,18 +3056,14 @@ struct llama_sbatch {
         } else {
             // simple split
             if (batch->n_seq_id) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.n_seq_id = batch->n_seq_id + seq.offset;
-                }
+                ubatch.n_seq_id = batch->n_seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
                 }
             }
             if (batch->seq_id) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.seq_id = batch->seq_id + seq.offset;
-                }
+                ubatch.seq_id = batch->seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;

From a5b57b08ce1998f7046df75324e86b9e2561c7af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 22 Sep 2024 09:34:52 +0200
Subject: [PATCH 19/78] CUDA: enable Gemma FA for HIP/Pascal (#9581)

---
 ggml/src/ggml-cuda.cu       | 20 ++++++++++----------
 ggml/src/ggml-cuda/fattn.cu |  2 +-
 tests/test-backend-ops.cpp  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index f94051198..bf21c643d 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2976,19 +2976,19 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_LEAKY_RELU:
         case GGML_OP_RWKV_WKV:
             return true;
-        case GGML_OP_FLASH_ATTN_EXT:
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
-#else
-            if (op->src[0]->ne[0] == 128) {
-                return true;
-            }
+        case GGML_OP_FLASH_ATTN_EXT: {
             if (op->src[0]->ne[0] ==  64 && op->src[1]->type == GGML_TYPE_F16) {
                 return true;
             }
-            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
-                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            if (op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
+                return true;
+            }
+            const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
+            return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+        }
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
         case GGML_OP_OPT_STEP_ADAMW:
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index f28a19d40..83e5589a1 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -314,7 +314,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 
     if (!fast_fp16_available(cc)) {
-        if (Q->ne[1] <= 8) {
+        if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
             ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
         } else {
             ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index efa88688c..9a96cfc4c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3599,7 +3599,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
                     if (hs != 128 && logit_softcap != 0.0f) continue;
                     for (int nh : { 32, }) {
                         for (int kv : { 512, 1024, }) {
-                            for (int nb : { 1, 2, 4, 8, }) {
+                            for (int nb : { 1, 3, 32, 35, }) {
                                 for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
                                     test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
                                 }

From 912c331d3dba3a079815844208dc36164baa8cc7 Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Sun, 22 Sep 2024 21:26:50 +0800
Subject: [PATCH 20/78] Fix merge error in #9454 (#9589)

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
---
 ggml/src/ggml-cuda.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index bf21c643d..5bd4660c3 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2351,6 +2351,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_RWKV_WKV:
             ggml_cuda_op_rwkv_wkv(ctx, dst);
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             ggml_cuda_cross_entropy_loss_back(ctx, dst);
             break;

From c35e586ea57221844442c65a1172498c54971cb0 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Sun, 22 Sep 2024 22:55:49 +0800
Subject: [PATCH 21/78] musa: enable building fat binaries, enable unified
 memory, and disable Flash Attention on QY1 (MTT S80) (#9526)

* mtgpu: add mp_21 support

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* mtgpu: disable flash attention on qy1 (MTT S80); disable q3_k and mul_mat_batched_cublas

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* mtgpu: enable unified memory

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* mtgpu: map cublasOperation_t to mublasOperation_t (sync code to latest)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 Makefile                             |  2 +-
 ggml/src/CMakeLists.txt              |  2 +-
 ggml/src/ggml-cuda.cu                | 18 ++++++++++++++++--
 ggml/src/ggml-cuda/common.cuh        |  6 ++++++
 ggml/src/ggml-cuda/fattn-tile-f32.cu |  6 +++++-
 ggml/src/ggml-cuda/vendors/musa.h    |  2 ++
 6 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index f922f7083..8a903d7ed 100644
--- a/Makefile
+++ b/Makefile
@@ -611,7 +611,7 @@ ifdef GGML_CUDA
 
 		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
 		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
 	else
 		ifneq ('', '$(wildcard /opt/cuda)')
 			CUDA_PATH ?= /opt/cuda
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 527c22c68..6c691a4c5 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -364,7 +364,7 @@ if (GGML_CUDA)
         if (GGML_MUSA)
             set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
             foreach(SOURCE ${GGML_SOURCES_CUDA})
-                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
             endforeach()
         endif()
 
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 5bd4660c3..a0d256100 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -136,7 +136,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return res;
 #else
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
@@ -149,7 +149,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 #else
     return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS)
 
 #endif
 }
@@ -2830,6 +2830,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
                     return false;
                 }
+#ifdef GGML_USE_MUSA
+                if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+                    !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                    return false;
+                }
+#endif // GGML_USE_MUSA
                 switch (a->type) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:
@@ -2853,6 +2859,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                     case GGML_TYPE_IQ3_XXS:
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
+#ifdef GGML_USE_MUSA
+                        if (a->type == GGML_TYPE_Q3_K) {
+                            return false;
+                        }
+#endif // GGML_USE_MUSA
                         return true;
                     default:
                         return false;
@@ -2978,6 +2989,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_RWKV_WKV:
             return true;
         case GGML_OP_FLASH_ATTN_EXT: {
+#ifndef FLASH_ATTN_AVAILABLE
+            return false;
+#endif
             if (op->src[0]->ne[0] ==  64 && op->src[1]->type == GGML_TYPE_F16) {
                 return true;
             }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 85eb200f0..6a4bcdba0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -50,6 +50,8 @@
 #define CC_RDNA1      (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3      (CC_OFFSET_AMD + 1100)
+#define CC_QY1        210
+#define CC_QY2        220
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -134,6 +136,10 @@ typedef float2 dfloat2;
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#define FLASH_ATTN_AVAILABLE
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+
 static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
index 827437ca0..f402195ce 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -44,13 +44,17 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
         return;
     }
 
-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
     const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
index 8df571149..1604b8229 100644
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -26,6 +26,7 @@
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
+#define cublasOperation_t mublasOperation_t
 #define cublasGetStatusString mublasStatus_to_string
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
@@ -56,6 +57,7 @@
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost
+#define cudaMallocManaged musaMallocManaged
 #define cudaMemcpy musaMemcpy
 #define cudaMemcpyAsync musaMemcpyAsync
 #define cudaMemcpyPeerAsync musaMemcpyPeerAsync

From e62e9789cda3bf5573a747e55ec2a7ee32908f56 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Date: Mon, 23 Sep 2024 08:58:06 +0530
Subject: [PATCH 22/78] Revert "[SYCL] fallback mmvq (#9088)" (#9579)

This reverts commit 50addec9a532a6518146ab837a85504850627316.
---
 ggml/src/ggml-sycl.cpp        | 3 +--
 ggml/src/ggml-sycl/common.hpp | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 16e6be4a0..6978a3192 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -3496,8 +3496,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     bool use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
-        && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda || src1->ne[1] > MMVQ_MIN_BATCH_SIZE);
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 
     bool use_mul_mat_q =  ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 05947ccb7..bc0faa867 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -134,7 +134,6 @@ typedef sycl::float2 dfloat2;
 #endif // GGML_SYCL_F16
 
 #define MMVQ_MAX_BATCH_SIZE  8
-#define MMVQ_MIN_BATCH_SIZE  4
 
 static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 

From bf9c1013ac40e5f1bd8e60b6d8bf16e0e8401445 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 23 Sep 2024 11:27:47 +0300
Subject: [PATCH 23/78] metal : use F32 prec for K*Q in vec FA (#9595)

ggml-ci
---
 ggml/src/ggml-metal.metal | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index f323ab5f4..2b2000323 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2631,11 +2631,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
         const short iv3 = iq3 / rv3;
 
         // load the queries from shared memory into local memory
-        half4 mq[D4];
+        float4 mq[D4];
 
         for (short ii = 0; ii < D4; ii += NW) {
             short i = ii + tiisg;
-            mq[i] = sq4[i];
+            mq[i] = (float4) sq4[i];
         }
 
         // pointer to the mask
@@ -2661,11 +2661,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
                     for (short ii = 0; ii < D4; ii += NW) {
                         const short i = ii + tiisg;
 
-                        half4x4 mk;
-                        mk[0] = pk4[i + 0*(nb11/8)];
-                        mk[1] = pk4[i + 1*(nb11/8)];
-                        mk[2] = pk4[i + 2*(nb11/8)];
-                        mk[3] = pk4[i + 3*(nb11/8)];
+                        float4x4 mk;
+                        mk[0] = (float4) pk4[i + 0*(nb11/8)];
+                        mk[1] = (float4) pk4[i + 1*(nb11/8)];
+                        mk[2] = (float4) pk4[i + 2*(nb11/8)];
+                        mk[3] = (float4) pk4[i + 3*(nb11/8)];
 
                         mqk += (float4) (mq[i] * mk);
                     }

From 37f8c7b4c97784496cfd91040d55fa22f50b1d57 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 23 Sep 2024 11:28:02 +0300
Subject: [PATCH 24/78] perplexity : remove extra new lines after chunks
 (#9596)

---
 examples/perplexity/perplexity.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index cbd466656..87347135e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -444,7 +444,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
             }
             LOG("%.2f minutes\n", total_seconds / 60.0);
         }
-        LOG("\n");
 
         //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
@@ -638,7 +637,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             }
             LOG("%.2f minutes\n", total_seconds / 60.0);
         }
-        LOG("\n");
 
         for (int seq = 0; seq < n_seq_batch; seq++) {
             const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);

From 1e7b9299c6ccb5bbc55d3db7cfa9b51f3ab09b59 Mon Sep 17 00:00:00 2001
From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Date: Mon, 23 Sep 2024 19:36:38 +0530
Subject: [PATCH 25/78] ggml : AVX512 gemm for Q4_0_8_8 (#9532)

* AVX512 version of ggml_gemm_q4_0_8x8_q8_0

* Remove zero vector parameter passing

* Rename functions and rearrange order of macros

* Edit commments

* style : minor adjustments

* Update x to start from 0

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml/src/ggml-aarch64.c | 537 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 503 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 27375d0d7..2b01b4f93 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -39,11 +39,44 @@
 //
 #if defined(__AVX__)
 #if defined(__F16C__)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
+#define GGML_F32Cx16_REPEAT_LOAD(x)  _mm512_cvtph_ps(_mm256_set_m128i(x, x))
+#endif
 // the  _mm256_cvt intrinsics require F16C
 #define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
 #define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
 #else
+#if defined(__AVX512F__)
+static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
+    float tmp[16];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
+    float tmp[16];
+    uint16_t tmphalf[8];
+    _mm_storeu_si128((__m128i*)tmphalf, x);
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+#endif
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];
 
@@ -78,30 +111,65 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang
 #define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
 #define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     __avx_repeat_f32cx8_load(x)
 #define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     __avx_rearranged_f32cx8_load(x, arrangeMask)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     __avx512_f32cx8x2_load(x, y)
+#define GGML_F32Cx16_REPEAT_LOAD(x)  __avx512_repeat_f32cx16_load(x)
+#endif
 #endif
 #endif
 
 
 #if defined(__AVX2__) || defined(__AVX512F__)
-static inline __m256i sum_i16_pairs_int(const __m256i x) {
+#if defined(__AVX512F__)
+// add int16_t pairwise and return as 512 bit int vector
+static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
+    const __m512i ones = _mm512_set1_epi16(1);
+    return _mm512_madd_epi16(ones, x);
+}
+
+static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+    const __m512i zero = _mm512_setzero_si512();
+    return _mm512_dpbusd_epi32(zero, ax, sy);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m512i dot = _mm512_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_int_32x16(dot);
+#endif
+}
+
+// multiply int8_t, add results pairwise twice and return as 512 bit int vector
+static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
+    const __m512i zero = _mm512_setzero_si512();
+    // Get absolute values of x vectors
+    const __m512i ax = _mm512_abs_epi8(x);
+    // Sign the values of the y vectors
+    __mmask64 blt0 = _mm512_movepi8_mask(x);
+    const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
+    return mul_sum_us8_pairs_int32x16(ax, sy);
+}
+#endif
+
+// add int16_t pairwise and return as 256 bit int vector
+static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
     const __m256i ones = _mm256_set1_epi16(1);
     return _mm256_madd_epi16(ones, x);
 }
 
-static inline __m256i mul_sum_us8_pairs_int(const __m256i ax, const __m256i sy) {
+static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_int(dot);
+    return sum_i16_pairs_int32x8(dot);
 #endif
 }
 
 // Integer variant of the function defined in ggml-quants.c
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
+// multiply int8_t, add results pairwise twice and return as 256 bit int vector
+static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
 #if __AVXVNNIINT8__
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbssd_epi32(zero, x, y);
@@ -110,7 +178,7 @@ static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
     const __m256i ax = _mm256_sign_epi8(x, x);
     // Sign the values of the y vectors
     const __m256i sy = _mm256_sign_epi8(y, x);
-    return mul_sum_us8_pairs_int(ax, sy);
+    return mul_sum_us8_pairs_int32x8(ax, sy);
 #endif
 }
 #endif
@@ -929,17 +997,17 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 // ...........................................................................
                 // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
 
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
 
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
 
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
 
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
 
                 // Accumulated values multipled with appropriate scales
                 acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
@@ -2421,10 +2489,411 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
     signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
     // Permute mask used for easier vector processing at later stages
-    __m256i requiredOrder = _mm256_set_epi32(3 ,2 ,1 ,0, 7 ,6, 5, 4);
+    __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
+    int64_t xstart = 0;
+    int anr = nr - nr%16; // Used to align nr with boundary of 16
+#ifdef __AVX512F__
+    int anc = nc - nc%16; // Used to align nc with boundary of 16
+    // Mask to mask out nibbles from packed bytes expanded to 512 bit length
+    const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
+    // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
+    __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
+
+    // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+    for (; y < anr / 4; y += 4) {
+
+        const block_q8_0x4 * a_ptrs[4];
+
+        a_ptrs[0] = a_ptr_start + (y * nb);
+        for (int i = 0; i < 3; ++i) {
+            a_ptrs[i + 1] = a_ptrs[i] + nb;
+        }
+
+        // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+        for (int64_t x = 0; x < anc / 8; x += 2) {
+
+            const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+            const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+            // Master FP accumulators
+            __m512 acc_rows[16];
+            for (int i = 0; i < 16; i++) {
+                acc_rows[i] = _mm512_setzero_ps();
+            }
+
+            for (int64_t b = 0; b < nb; b++) {
+                // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
+                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                // 4-bit -> 8-bit - Sign is maintained
+                const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                // Shuffle pattern one - right side input
+                const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                // Shuffle pattern two - right side input
+
+                const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+                // Scale values - Load the weight scale values of two block_q4_0x8
+                const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                // Process LHS in pairs of rows
+                for (int rp = 0; rp < 4; rp++) {
+
+                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                    // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                    __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                    __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                    __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                    __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                    __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                    __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                    __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                    __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                    __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                    __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                    __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                    __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                    __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                    __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                    __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                    __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                    // Shuffle pattern one - left side input
+
+                    const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                    const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                    const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                    const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                    const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                    const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                    const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                    const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                    // Shuffle pattern two - left side input
+
+                    const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                    const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                    const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                    const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                    const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                    const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                    const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                    const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                    // Resembles MMLAs into 2x2 matrices in ARM Version
+                    __m512i iacc_mat_00_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_01_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_10_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_11_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_00_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_01_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                    __m512i iacc_mat_10_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_11_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                    __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                    __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                    __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                    __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                    // Straighten out to make 4 row vectors
+                    __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+                    __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+                    __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+                    __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+                    const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                    // Multiply with appropiate scales and accumulate
+                    acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
+                    acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
+                    acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                    acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+                }
+            }
+
+            // Store the accumulated values
+            for (int i = 0; i < 16; i++) {
+                _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+            }
+        }
+    }
+    // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+    for (; y < nr / 4; y ++) {
+
+        const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+        // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+        for (int64_t x = 0; x < anc / 8; x += 2) {
+
+            const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+            const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+            // Master FP accumulators
+            __m512 acc_rows[4];
+            for (int i = 0; i < 4; i++) {
+                acc_rows[i] = _mm512_setzero_ps();
+            }
+
+            for (int64_t b = 0; b < nb; b++) {
+                // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                // 4-bit -> 8-bit - Sign is maintained
+                const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                // Shuffle pattern one - right side input
+                const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                // Shuffle pattern two - right side input
+
+                const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+
+                // Scale values - Load the weight scale values of two block_q4_0x8
+                const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
+                __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
+                __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
+                __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
+                __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                // Shuffle pattern one - left side input
+
+                const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                // Shuffle pattern two - left side input
+
+                const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                // Resembles MMLAs into 2x2 matrices in ARM Version
+                __m512i iacc_mat_00_sp1 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                __m512i iacc_mat_01_sp1 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                __m512i iacc_mat_10_sp1 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                __m512i iacc_mat_11_sp1 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                __m512i iacc_mat_00_sp2 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                __m512i iacc_mat_01_sp2 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                __m512i iacc_mat_10_sp2 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                __m512i iacc_mat_11_sp2 =
+                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                // Straighten out to make 4 row vectors
+                __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+                __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+                __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+                __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+                // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
+                const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                // Multiply with appropiate scales and accumulate
+                acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
+                acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
+                acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+            }
+
+            // Store the accumulated values
+            for (int i = 0; i < 4; i++) {
+                _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+            }
+        }
+    }
+    if (anc != nc) {
+        xstart = anc/8;
+        y = 0;
+    }
+#endif // __AVX512F__
 
     // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
-    int anr = nr - nr %16; // Used to align nr with boundary of 16
 
     for (; y < anr / 4; y += 4) {
         const block_q8_0x4 * a_ptrs[4];
@@ -2435,7 +2904,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
 
         // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
-        for (int64_t x = 0; x < nc / 8; x++) {
+        for (int64_t x = xstart; x < nc / 8; x++) {
 
             const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
 
@@ -2547,21 +3016,21 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
                     // Resembles MMLAs into 2x2 matrices in ARM Version
                     __m256i iacc_mat_00_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
                     __m256i iacc_mat_01_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
                     __m256i iacc_mat_10_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
                     __m256i iacc_mat_11_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
                     __m256i iacc_mat_00_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
                     __m256i iacc_mat_01_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
                     __m256i iacc_mat_10_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
                     __m256i iacc_mat_11_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
 
                     // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
                     __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
@@ -2599,7 +3068,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
 
         // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-        for (int64_t x = 0; x < nc / 8; x++) {
+        for (int64_t x = xstart; x < nc / 8; x++) {
 
             const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
 
@@ -2711,21 +3180,21 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
                 // Resembles MMLAs into 2x2 matrices in ARM Version
                 __m256i iacc_mat_00_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
                 __m256i iacc_mat_01_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
                 __m256i iacc_mat_10_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
                 __m256i iacc_mat_11_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
                 __m256i iacc_mat_00_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
                 __m256i iacc_mat_01_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
                 __m256i iacc_mat_10_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
                 __m256i iacc_mat_11_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
 
                 // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
                 __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);

From f3979df762b75a2b1c0f622a2cd15d1bc60f037f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 23 Sep 2024 18:43:40 +0300
Subject: [PATCH 26/78] flake.lock: Update (#9586)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/4f807e8940284ad7925ebd0a0993d2a1791acb2f?narHash=sha256-IiA3jfbR7K/B5%2B9byVi9BZGWTD4VSbWe8VLpp9B/iYk%3D' (2024-09-11)
  → 'github:NixOS/nixpkgs/c04d5652cfa9742b1d519688f65d1bbccea9eb7e?narHash=sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK%2BWk%3D' (2024-09-19)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 0db5ff92a..6333a09f0 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1726062873,
-        "narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=",
+        "lastModified": 1726755586,
+        "narHash": "sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK+Wk=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f",
+        "rev": "c04d5652cfa9742b1d519688f65d1bbccea9eb7e",
         "type": "github"
       },
       "original": {

From 1d48e98e4f3316bd2f6b187d288c7b6cb88d5cb3 Mon Sep 17 00:00:00 2001
From: Riceball LEE <snowyu.lee@gmail.com>
Date: Mon, 23 Sep 2024 23:58:17 +0800
Subject: [PATCH 27/78] readme : add programmable prompt engine language CLI
 (#9599)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4d24dd591..ce954f713 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)

From f0c7b5edf82aa200656fd88c11ae3a805d7130bf Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <quic_maxk@quicinc.com>
Date: Mon, 23 Sep 2024 11:42:43 -0700
Subject: [PATCH 28/78] threads: improve ggml_barrier scaling with large number
 of threads (#9598)

Make sure n_barrier and n_barrier_passed do not share the cache line to avoid cache line bouncing.
This optimization shows performance improvements even for n_threads <= 8 cases.

Resurect TSAN (Thread Sanitizer) check so that we can avoid doing expensive read-modify-write
in the normal case and just use thread-fence as originally intended.

---
Here is the original description and suggestions from Willy Tarreau :

There's currently some false sharing between n_barrier and
n_barrier_passed that is amplified in ggml_barrier() by the fact that
all threads need to increment n_barrier when entering, while all
previous threads continue to read n_barrier_passed, waiting for the last
one to release them all. The side effect is that all these readers are
slowing down all new threads by making the cache line bounce back and
forth between readers and writers.

Just placing them in two distinct cache lines is sufficient to boost
the performance by 21% on a 80-core ARM server compared to the
no-openmp version, and by 3% compared to the openmp version.

Note that the variables could have been spread apart in the structure
as well, but it doesn't seem that the size of this threadpool struct is
critical so here we're simply aligning them.

Finally, the same issue was present when leaving the barrier since all
threads had to update the n_barrier_passed counter, though only one
would add a non-zero value. This alone is responsible for half of the
cost due to undesired serialization.

It might be possible that using a small array of n_barrier counters
could make things even faster on many-core systems, but it would likely
complicate the logic needed to detect the last thread.

Co-authored-by: Willy Tarreau <w@1wt.eu>
---
 ggml/src/ggml.c | 59 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 201d5466a..96c09ca89 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
 #pragma warning(disable: 4702)
 #endif
 
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE  64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else  // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN
@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
 #include <windows.h>
 
 #if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
@@ -2007,8 +2028,8 @@ struct ggml_threadpool {
 
     // synchronization primitives
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
-    atomic_int n_barrier;
-    atomic_int n_barrier_passed;
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     // these are atomic as an annotation for thread-sanitizer
@@ -3196,20 +3217,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
     // enter barrier (full seq-cst fence)
     int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int last = 0;
     if (n_barrier == (n_threads - 1)) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-        last = 1;
-    } else {
-        // wait for other threads
-        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-            ggml_thread_cpu_relax();
-        }
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
     }
 
     // exit barrier (full seq-cst fence)
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
 #endif
 }
 
@@ -20240,10 +20268,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
 
 // sync thread state after polling
 static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
-    // so instead we just use a dummy read-modify-write
-    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+    UNUSED(state);
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {

From 0b3bf966f47bf2ba88e5d4e3ed429602008c7e63 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Mon, 23 Sep 2024 22:23:54 +0200
Subject: [PATCH 29/78] server : add --no-context-shift option (#9607)

* server : add --no-context-shift option

* small fix

* Update examples/server/tests/features/embeddings.feature

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : minor fix

* revert usage of GGML_ASSERT

* update server documentation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/arg.cpp                                |  2 +-
 examples/server/README.md                     | 20 +++---
 examples/server/server.cpp                    | 27 +++++++-
 .../server/tests/features/ctx_shift.feature   | 62 +++++++++++++++++++
 .../server/tests/features/embeddings.feature  | 22 +++++--
 examples/server/tests/features/steps/steps.py | 28 ++++++---
 6 files changed, 139 insertions(+), 22 deletions(-)
 create mode 100644 examples/server/tests/features/ctx_shift.feature

diff --git a/common/arg.cpp b/common/arg.cpp
index 922391069..c1ec3c4f9 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
diff --git a/examples/server/README.md b/examples/server/README.md
index 326e05e1e..741950c8a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -21,8 +21,6 @@ The project is under active development, and we are [looking for feedback and co
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `-v, --verbose` | print verbose information |
-| `--verbosity N` | set specific verbosity level (default: 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
@@ -40,15 +38,18 @@ The project is under active development, and we are [looking for feedback and co
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
+| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
 | `-p, --prompt PROMPT` | prompt to start generation with |
+| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
+| `-sp, --special` | special tokens output enabled (default: false) |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
+| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |
@@ -87,7 +88,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
-| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env:  LLAMA_ARG_N_PARALLEL) |
+| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
@@ -128,12 +129,13 @@ The project is under active development, and we are [looking for feedback and co
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
-| `--log-test` | Log test |
 | `--log-disable` | Log disable |
-| `--log-enable` | Log enable |
-| `--log-new` | Log new |
-| `--log-append` | Log append |
-| `--log-file FNAME` | Log file |
+| `--log-file FNAME` | Log to file |
+| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
+| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
+| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0ca999994..8655c097a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1180,6 +1180,15 @@ struct server_context {
             SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
+        // if context shift is disabled, we stop when it reaches the context limit
+        if (slot.n_decoded >= slot.n_ctx) {
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+        }
+
         if (llama_token_is_eog(model, result.tok)) {
             slot.stopped_eos    = true;
             slot.has_next_token = false;
@@ -1480,7 +1489,7 @@ struct server_context {
             if (result.error) {
                 error_handler(result.data);
                 cancel_tasks(id_tasks);
-                break;
+                return;
             }
 
             size_t idx = result.data["index"];
@@ -1827,6 +1836,14 @@ struct server_context {
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
                 if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                    if (!params.ctx_shift) {
+                        // this check is redundant (for good)
+                        // we should never get here, because generation should already stopped in process_token()
+                        slot.release();
+                        send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
+                        continue;
+                    }
+
                     // Shift context
                     const int n_keep    = slot.params.n_keep + add_bos_token;
                     const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
@@ -1961,6 +1978,14 @@ struct server_context {
                                 continue;
                             }
                         } else {
+                            if (!params.ctx_shift) {
+                                // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                                    slot.release();
+                                    send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                    continue;
+                                }
+                            }
                             if (slot.params.n_keep < 0) {
                                 slot.params.n_keep = slot.n_prompt_tokens;
                             }
diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature
new file mode 100644
index 000000000..ba3afcf06
--- /dev/null
+++ b/examples/server/tests/features/ctx_shift.feature
@@ -0,0 +1,62 @@
+@llama.cpp
+@ctx_shift
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file test-model.gguf
+    And   a model alias tinyllama-2
+    And   BOS token is 1
+    And   42 as server seed
+    And   256 KV cache size
+    And   32 as batch size
+    And   2 slots
+
+  Scenario: Inference with context shift
+    And   64 server max tokens to predict
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with no api error
+    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
+    And   the completion is  truncated
+    And   109 prompt tokens are processed
+
+  Scenario Outline: Inference without context shift
+    And   <n_predict> server max tokens to predict
+    And   disable context shifting
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Hi how are you
+    """
+    And   a completion request with no api error
+    Then  <n_token_output> tokens are predicted matching twind|Anna
+    And   the completion is <truncated> truncated
+    And   8 prompt tokens are processed
+    Examples:
+      | n_predict | n_token_output | truncated |
+      | 64        | 64             | not       |
+      | -1        | 120            |           |
+
+  Scenario: Inference without context shift (expected error: prompt too long)
+    And   disable context shifting
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with 400 api error
+
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index e1eade6cd..818ea3beb 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -10,11 +10,11 @@ Feature: llama.cpp server
     And   42 as server seed
     And   2 slots
     # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
+    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
     # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   512 as batch size
-    And   512 as ubatch size
-    And   2048 KV cache size
+    And   128 as batch size
+    And   128 as ubatch size
+    And   512 KV cache size
     And   embeddings extraction
     Then  the server is starting
     Then  the server is healthy
@@ -26,6 +26,20 @@ Feature: llama.cpp server
     """
     Then embeddings are generated
 
+  Scenario: Embedding (error: prompt too long)
+    When embeddings are computed for:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And  embeddings request with 500 api error
+
   Scenario: OAI Embeddings compatibility
     Given a model bert-bge-small
     When an OAI compatible embeddings computation request for:
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 062f084be..0fea0fe87 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -77,6 +77,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.response_format = None
     context.temperature = None
     context.lora_file = None
+    context.disable_ctx_shift = False
 
     context.tasks_result = []
     context.concurrent_tasks = []
@@ -148,7 +149,7 @@ def step_n_slots(context, n_slots: int):
 
 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict
+    context.n_server_predict = n_predict if n_predict > 0 else None
 
 
 @step('{slot_save_path} as slot save path')
@@ -180,6 +181,9 @@ def step_server_embeddings(context):
 def step_server_metrics(context):
     context.server_metrics = True
 
+@step('disable context shifting')
+def step_server_disable_ctx_shift(context):
+    context.disable_ctx_shift = True
 
 @step("the server is starting")
 def step_start_server(context):
@@ -257,7 +261,7 @@ async def step_all_slots_status(context, expected_slot_status_string: Literal['i
 @step('a completion request with {api_error} api error')
 @async_run_until_complete
 async def step_request_completion(context, api_error: Literal['raised'] | str):
-    expect_api_error = api_error == 'raised'
+    expect_api_error = api_error == 'raised' or api_error != 'no'
     seeds = await completions_seed(context, num_seeds=1)
     completion = await request_completion(context.prompts.pop(),
                                           seeds[0] if seeds is not None else seeds,
@@ -272,8 +276,11 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
     context.tasks_result.append(completion)
     if context.debug:
         print(f"Completion response: {completion}")
-    if expect_api_error:
+    if api_error == 'raised':
         assert completion == 401, f"completion must be an 401 status code: {completion}"
+    elif api_error.isdigit():
+        api_error_code = int(api_error)
+        assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
 
 
 @step('{predicted_n:d} tokens are predicted matching {re_content}')
@@ -645,6 +652,9 @@ def step_assert_embeddings(context):
     for embedding in context.embeddings:
         assert_embeddings(embedding)
 
+@step('embeddings request with {api_error_code:d} api error')
+def step_assert_embeddings(context, api_error_code: int):
+    assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"
 
 @step('an OAI compatible embeddings computation request for')
 @async_run_until_complete
@@ -1089,15 +1099,17 @@ async def oai_chat_completions(user_prompt,
     return completion_response
 
 
-async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
+async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
     async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{base_url}/embedding',
                                 json={
                                     "content": content,
                                 }) as response:
-            assert response.status == 200
-            response_json = await response.json()
-            return [response_json['embedding']]
+            if response.status == 200:
+                response_json = await response.json()
+                return [response_json['embedding']]
+            else:
+                return response.status
 
 
 async def request_oai_embeddings(input, seed,
@@ -1372,6 +1384,8 @@ def start_server_background(context):
         server_args.append('--verbose')
     if context.lora_file:
         server_args.extend(['--lora', context.lora_file])
+    if context.disable_ctx_shift:
+        server_args.extend(['--no-context-shift'])
 
     args = [str(arg) for arg in [context.server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")

From 116efee0eef09d8c3c4c60b52fa01b56ddeb432c Mon Sep 17 00:00:00 2001
From: Ivan <nekotekina@gmail.com>
Date: Tue, 24 Sep 2024 03:14:24 +0300
Subject: [PATCH 30/78] cuda: add q8_0->f32 cpy operation (#9571)

llama: enable K-shift for quantized KV cache
It will fail on unsupported backends or quant types.
---
 ggml/src/ggml-cuda.cu     |  3 +++
 ggml/src/ggml-cuda/cpy.cu | 51 +++++++++++++++++++++++++++++++++++++++
 src/llama.cpp             | 37 +++++++++++++++++++++-------
 3 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index a0d256100..0bb7f2d99 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2899,6 +2899,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
                     return true;
                 }
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 51deb75fd..54c0f66d2 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -81,6 +81,17 @@ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
+static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
+    const block_q8_0 * xi = (const block_q8_0 *) cxi;
+    float * dsti = (float *) cdsti;
+
+    const float d = (float)xi->d;
+
+    for (int j = 0; j < QK8_0; j++) {
+       dsti[j] = xi->qs[j] * d;
+    }
+}
+
 static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q4_0 * dsti = (block_q4_0 *) cdsti;
@@ -288,6 +299,32 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static void ggml_cpy_f16_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -329,6 +366,16 @@ static void ggml_cpy_f32_q8_0_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_q8_0_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_q4_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -437,6 +484,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
@@ -471,6 +520,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
diff --git a/src/llama.cpp b/src/llama.cpp
index bc4e408e0..e5e0d1a66 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9930,17 +9930,36 @@ struct llm_build_context {
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
             struct ggml_tensor * rope_factors = build_rope_factors(il);
-            struct ggml_tensor * tmp =
-                // we rotate only the first n_rot dimensions
-                ggml_rope_ext_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k_l[il],
-                            n_embd_head_k, n_head_kv, n_ctx,
-                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                            0),
+            struct ggml_tensor * k =
+                ggml_view_3d(ctx0, kv_self.k_l[il],
+                    n_embd_head_k, n_head_kv, n_ctx,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    0);
+
+            struct ggml_tensor * tmp;
+            if (ggml_is_quantized(k->type)) {
+                // dequantize to f32 -> RoPE -> quantize back
+                tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
+                cb(tmp, "K_f32", il);
+                for (auto * backend : lctx.backends) {
+                    // Figure out which backend KV cache belongs to
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+                        break;
+                    }
+                }
+                tmp = ggml_rope_ext_inplace(ctx0, tmp,
                         lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
-
+                cb(tmp, "K_shifted_f32", il);
+                tmp = ggml_cpy(ctx0, tmp, k);
+            } else {
+                // we rotate only the first n_rot dimensions
+                tmp = ggml_rope_ext_inplace(ctx0, k,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
             cb(tmp, "K_shifted", il);
             ggml_build_forward_expand(gf, tmp);
         }

From c087b6f11d3385f4293b6841ebfb755063479490 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <quic_maxk@quicinc.com>
Date: Mon, 23 Sep 2024 21:18:48 -0700
Subject: [PATCH 31/78] threads: fix msvc build without openmp (#9615)

We're missing atomic_thread_fence() in MSVC builds when openmp is disabled.
---
 ggml/src/ggml.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 96c09ca89..d4aa0a81b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -135,6 +135,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
 static void atomic_flag_clear(atomic_flag * ptr) {
     InterlockedExchange(ptr, 0);
 }
+static void atomic_thread_fence(memory_order mo) {
+    MemoryBarrier();
+}
 #else // clang
 #include <stdatomic.h>
 #endif

From b0f27361f3539a81d983a8b045f3c61e682d9fc0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 24 Sep 2024 09:03:17 +0300
Subject: [PATCH 32/78] sampling : avoid expensive softmax during greedy
 sampling (#9605)

* sampling : avoid expensive softmax during greedy sampling

ggml-ci

* speculative : fix default RNG seed + set sparams.n_probs

* Update tests/test-sampling.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* sampling : add clarifying comment [no ci]

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 common/sampling.cpp                  | 10 ++++++-
 examples/speculative/speculative.cpp |  5 +++-
 include/llama.h                      |  1 +
 src/llama-sampling.cpp               |  7 +++--
 tests/test-sampling.cpp              | 42 +++++++++++++++++++++++++++-
 5 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index e51d07611..3dc7f1120 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -209,7 +209,15 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
             GGML_ASSERT(false && "unknown mirostat version");
         }
     } else {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        if (params.n_probs > 0) {
+            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
+            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
+            //
+            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
+            // it is much faster, since we avoid sorting all tokens and should give a good approximation
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
     }
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index fbac21811..adf6255e1 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -32,6 +32,9 @@ struct seq_draft {
 int main(int argc, char ** argv) {
     gpt_params params;
 
+    // needed to get candidate probs even for temp <= 0.0
+    params.sparams.n_probs = 128;
+
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
@@ -49,7 +52,7 @@ int main(int argc, char ** argv) {
     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
     const float p_split  = params.p_split;
 
-    std::default_random_engine rng(params.sparams.seed);
+    std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
     std::uniform_real_distribution<> u_dist;
 
     // init llama.cpp
diff --git a/include/llama.h b/include/llama.h
index f316a87ba..132937a07 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1066,6 +1066,7 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 5299f5116..e255a8fc4 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -3,13 +3,14 @@
 #include "llama-vocab.h"
 #include "llama-grammar.h"
 
-#include <cassert>
 #include <algorithm>
-#include <cstring>
-#include <ctime>
+#include <cassert>
 #include <cfloat>
 #include <chrono>
 #include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <numeric>
 #include <random>
 #include <unordered_map>
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index d738b7a45..6e021c4c7 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -1,6 +1,5 @@
 #include "ggml.h"
 #include "llama.h"
-#include "llama-sampling.h"
 
 #ifdef NDEBUG
 #undef NDEBUG
@@ -249,6 +248,45 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
 
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
+    std::vector<llama_token_data> cur(data.size());
+    std::copy(data.begin(), data.end(), cur.begin());
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_sampler_apply(cnstr, &cur_p);
+    llama_sampler_reset(cnstr);
+    const int64_t t_start = ggml_time_us();
+    for (int i = 0; i < n_iter; i++) {
+        std::copy(data.begin(), data.end(), cur.begin());
+        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_sampler_apply(cnstr, &cur_p);
+        llama_sampler_reset(cnstr);
+    }
+    const int64_t t_end = ggml_time_us();
+    llama_sampler_free(cnstr);
+    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
+    const int n_vocab = 1 << 17;
+
+    std::vector<llama_token_data> data;
+
+    data.reserve(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+        data.emplace_back(llama_token_data{i, logit, 0.0f});
+    }
+
+    BENCH(llama_sampler_init_top_k    (40),      data, 32);
+    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
+    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_softmax  (),        data, 32);
+}
+
 int main(void) {
     ggml_time_init();
 
@@ -316,5 +354,7 @@ int main(void) {
 
     printf("OK\n");
 
+    test_perf();
+
     return 0;
 }

From 0aa15011e315659640504731d1d05663837130fa Mon Sep 17 00:00:00 2001
From: StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
Date: Mon, 23 Sep 2024 23:04:39 -0700
Subject: [PATCH 33/78] server : add newline after chat example (#9616)

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8655c097a..e5275a514 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3179,7 +3179,7 @@ int main(int argc, char ** argv) {
     }
 
     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
     ctx_server.queue_tasks.on_new_task(std::bind(
                 &server_context::process_single_task, &ctx_server, std::placeholders::_1));

From cea1486ecf34a1c7e014a9e290eb458f5a11f876 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 24 Sep 2024 10:15:35 +0300
Subject: [PATCH 34/78] log : add CONT level for continuing previous log entry
 (#9610)

---
 common/log.cpp             |  2 +-
 common/log.h               |  2 ++
 examples/infill/infill.cpp | 14 +++++++-------
 examples/main/main.cpp     | 28 ++++++++++++++--------------
 ggml/include/ggml.h        |  1 +
 src/llama-impl.h           |  2 ++
 src/llama.cpp              |  4 ++--
 7 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/common/log.cpp b/common/log.cpp
index 2825a227e..5a844ed59 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -82,7 +82,7 @@ struct gpt_log_entry {
             }
         }
 
-        if (level != GGML_LOG_LEVEL_NONE && prefix) {
+        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
             if (timestamp) {
                 // [M.s.ms.us]
                 fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
diff --git a/common/log.h b/common/log.h
index d13f72d89..84f9b3ed7 100644
--- a/common/log.h
+++ b/common/log.h
@@ -83,8 +83,10 @@ void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // w
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)
 
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
 #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 35607276a..d52425ae6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -263,9 +263,9 @@ int main(int argc, char ** argv) {
         if (params.n_keep > 0) {
         LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG("'\n");
+            LOG_CNT("'\n");
         }
         LOG_INF("\n");
     }
@@ -306,8 +306,8 @@ int main(int argc, char ** argv) {
 
     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
-    LOG("\n");
-    LOG("\n#####  Infill mode  #####\n\n");
+    LOG_INF("\n");
+    LOG_INF("\n#####  Infill mode  #####\n\n");
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
@@ -318,11 +318,11 @@ int main(int argc, char ** argv) {
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG(       "%s\n", control_message);
+        LOG_INF(       "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c3041f1fb..6bbb1e13e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -385,9 +385,9 @@ int main(int argc, char ** argv) {
         if (params.n_keep > add_bos) {
             LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG("'\n");
+            LOG_CNT("'\n");
         }
         LOG_INF("\n");
     }
@@ -409,40 +409,40 @@ int main(int argc, char ** argv) {
     }
 
     if (params.interactive) {
-        LOG("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
-                LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
                     auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                     }
                 }
             }
         }
 
         if (params.input_prefix_bos) {
-            LOG("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
         }
 
         if (!params.input_prefix.empty()) {
-            LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
 
         if (!params.input_suffix.empty()) {
-            LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
@@ -474,7 +474,7 @@ int main(int argc, char ** argv) {
       //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
         LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
     }
-    LOG("\n");
+    LOG_INF("\n");
 
     if (params.interactive) {
         const char * control_message;
@@ -486,11 +486,11 @@ int main(int argc, char ** argv) {
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG(       "%s\n", control_message);
+        LOG_INF(       "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2035001e9..d6c45c948 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -570,6 +570,7 @@ extern "C" {
         GGML_LOG_LEVEL_WARN  = 2,
         GGML_LOG_LEVEL_ERROR = 3,
         GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };
 
     // this tensor...
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 2bde75ec1..70f16b61c 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -28,6 +28,8 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
 
 //
 // helpers
diff --git a/src/llama.cpp b/src/llama.cpp
index e5e0d1a66..c1ba2b301 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18671,9 +18671,9 @@ struct llama_model * llama_load_model_from_file(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 *cur_percentage_p = percentage;
-                LLAMA_LOG(".");
+                LLAMA_LOG_CONT(".");
                 if (percentage >= 100) {
-                    LLAMA_LOG("\n");
+                    LLAMA_LOG_CONT("\n");
                 }
             }
             return true;

From 31ac5834fe75c296476658a124c06c84772aa641 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 24 Sep 2024 10:16:06 +0300
Subject: [PATCH 35/78] llama : keep track of all EOG tokens in the vocab
 (#9609)

ggml-ci
---
 src/llama-vocab.cpp |  6 +----
 src/llama-vocab.h   | 14 +++++++----
 src/llama.cpp       | 59 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 2c007477e..a771eccda 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1570,11 +1570,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && (
-        token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab) ||
-        token == llama_token_eom_impl(vocab)
-    );
+    return token != -1 && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index dc4b5f12f..cc46f642b 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -6,6 +6,7 @@
 #include <vector>
 #include <unordered_map>
 #include <map>
+#include <set>
 
 struct llama_vocab {
     using id    = llama_token;
@@ -49,12 +50,15 @@ struct llama_vocab {
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
     id special_eom_id    = -1;
 
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
     // tokenizer flags
-    bool tokenizer_add_space_prefix = false;
-    bool tokenizer_add_bos          = false;
-    bool tokenizer_add_eos          = false;
-    bool tokenizer_ignore_merges    = false;
-    bool tokenizer_clean_spaces     = false;  // clean_up_tokenization_spaces
+    bool tokenizer_add_space_prefix           = false;
+    bool tokenizer_add_bos                    = false;
+    bool tokenizer_add_eos                    = false;
+    bool tokenizer_ignore_merges              = false;
+    bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
     bool tokenizer_remove_extra_whitespaces   = false;
     bool tokenizer_escape_whitespaces         = true;
     bool tokenizer_treat_whitespace_as_suffix = false;
diff --git a/src/llama.cpp b/src/llama.cpp
index c1ba2b301..a718de054 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6509,21 +6509,21 @@ static void llm_load_vocab(
         //       for now, we apply this workaround to find the EOT token based on its text
         if (vocab.special_eot_id == -1) {
             for (const auto & t : vocab.token_to_id) {
-                if (
+                if (false
                         // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
                         //       need to fix convert script
                         //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
-                        (t.first == "<|eot_id|>" ||
-                         t.first == "<|im_end|>" ||
-                         t.first == "<|end|>" ||
-                         t.first == "<end_of_turn>" ||
-                         t.first == "<|endoftext|>"
-                        )
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                                __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                     break;
@@ -6546,6 +6546,44 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        vocab.special_eog_ids.clear();
+        for (const auto & t : vocab.token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                vocab.special_eog_ids.insert(t.second);
+                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            }
+        }
+
+        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
     }
 
     // build special tokens cache
@@ -6749,6 +6787,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
+    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+
+    for (const auto & id : vocab.special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
+    }
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 

From c038931615d2525d732943fa8661e29aa361b192 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Sep 2024 21:50:16 +0300
Subject: [PATCH 36/78] examples : adapt to ggml.h changes (ggml/0)

ggml-ci
---
 ggml/include/ggml.h | 3 +++
 ggml/src/ggml.c     | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d6c45c948..e24b8a319 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1980,6 +1980,9 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
 
+#define GGML_N_TASKS_MAX (-1)
+    // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
+
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context   * ctx,
             struct ggml_tensor    * a,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index d4aa0a81b..4b782b0c1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -313,7 +313,6 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_N_TASKS_MAX (-1)
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2

From bb5f8199754b5f055cf436711d14c58e7be28e12 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 24 Sep 2024 11:01:18 +0300
Subject: [PATCH 37/78] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index cf7b97d45..36eeed0cc 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e7b23907cb2816e9951fe9b524d7127ab777297a
+336c10a4c3c8ec99af484b25a0cddd397a09cdb2

From 70392f1f81470607ba3afef04aa56c9f65587664 Mon Sep 17 00:00:00 2001
From: Eric Zhang <34133756+EZForever@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:03:21 +0800
Subject: [PATCH 38/78] ggml : add AVX512DQ requirement for AVX512 builds
 (#9622)

---
 ggml/src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6c691a4c5..cbc349500 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1186,6 +1186,7 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         endif()
         if (GGML_AVX512)
             list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
             list(APPEND ARCH_FLAGS -mavx512bw)
         endif()
         if (GGML_AVX512_VBMI)

From 904837e0cb2f5f01bf5d5901b7aa57a026860ae4 Mon Sep 17 00:00:00 2001
From: Dou Xinpeng <15529241576@163.com>
Date: Wed, 25 Sep 2024 11:30:38 +0800
Subject: [PATCH 39/78] cann: fix crash when llama-bench is running on multiple
 cann devices (#9627)

---
 ggml/src/ggml-cann/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index e6a570107..edfa49614 100644
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -227,6 +227,7 @@ struct ggml_backend_cann_context {
      * @brief Destructor for cleaning up resources.
      */
     ~ggml_backend_cann_context() {
+        ggml_cann_set_device(device);
         if (copy_event != nullptr) {
             ACL_CHECK(aclrtDestroyEvent(copy_event));
         }

From 3d6bf6919f7b10726421779cd344f2da05421c68 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 25 Sep 2024 01:06:52 -0600
Subject: [PATCH 40/78] llama : add IBM Granite MoE architecture (#9438)

* feat(gguf-py): Add granitemoe architecture

This includes the addition of new tensor names for the new moe layers.
These may not be correct at this point due to the need for the hack in
gguf_writer.py to double-check the length of the shape for these layers.

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(convert_hf_to_gguf): Add GraniteMoeModel

GraniteMoe has the same configuration deltas as Granite

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(granitemoe convert): Split the double-sized input layer into gate and up

After a lot of staring and squinting, it's clear that the standard mixtral
expert implementation is equivalent to the vectorized parallel experts in
granite. The difference is that in granite, the w1 and w3 are concatenated
into a single tensor "input_linear." Rather than reimplementing all of the
math on the llama.cpp side, the much simpler route is to just split this
tensor during conversion and follow the standard mixtral route.

Branch: GraniteMoE

Co-Authored-By: alex.brooks@ibm.com

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(granitemoe): Implement granitemoe

GraniteMoE follows the mixtral architecture (once the input_linear layers
are split into gate_exps/up_exps). The main delta is the addition of the
same four multipliers used in Granite.

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* Typo fix in docstring

Co-Authored-By: ggerganov@gmail.com

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(conversion): Simplify tensor name mapping in conversion

Branch: GraniteMoE

Co-Authored-By: git@compilade.net
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(convert): Remove unused tensor name mappings

Branch: GraniteMoE

Co-Authored-By: git@compilade.net
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(convert): Sanity check on merged FFN tensor sizes

Branch: GraniteMoE

Co-Authored-By: git@compilade.net
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Allow "output" layer in granite moe architecture (convert and cpp)

Branch: GraniteMoE

Co-Authored-By: git@compilade.net
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(granite): Add missing 'output' tensor for Granite

This is a fix for the previous `granite` architecture PR. Recent snapshots
have included this (`lm_head.weights`) as part of the architecture

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 convert_hf_to_gguf.py          | 33 +++++++++++++++++++++++++++++++--
 gguf-py/gguf/constants.py      | 18 ++++++++++++++++++
 gguf-py/gguf/tensor_mapping.py | 20 +++++++++++---------
 src/llama.cpp                  | 30 ++++++++++++++++++++++++++++--
 4 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ff4c9226f..7be609054 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4102,16 +4102,45 @@ class GraniteModel(LlamaModel):
         #   consistency
         if attention_scale := self.hparams.get("attention_multiplier"):
             self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
         if embedding_scale := self.hparams.get("embedding_multiplier"):
             self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
         if residual_scale := self.hparams.get("residual_multiplier"):
             self.gguf_writer.add_residual_scale(residual_scale)
-        if logits_scaling := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scaling)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
 
 
 ###### CONVERSION LOGIC ######
 
+
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
     _tensor_type = torch.Tensor
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b36a60d49..560eee916 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum):
     NEMOTRON     = auto()
     EXAONE       = auto()
     GRANITE      = auto()
+    GRANITE_MOE  = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -392,6 +393,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.NEMOTRON:       "nemotron",
     MODEL_ARCH.EXAONE:         "exaone",
     MODEL_ARCH.GRANITE:        "granite",
+    MODEL_ARCH.GRANITE_MOE:    "granitemoe",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1232,6 +1234,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.GRANITE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -1242,6 +1245,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 2ebfa2b43..4e850726e 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -251,11 +251,12 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",             # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
-            "transformer.decoder_layer.{bid}.router",     # Grok
-            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+            "layers.{bid}.feed_forward.gate",                   # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral
+            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router",           # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -364,10 +365,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
diff --git a/src/llama.cpp b/src/llama.cpp
index a718de054..0accb1492 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -215,6 +215,7 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -266,6 +267,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE,          "exaone"       },
     { LLM_ARCH_RWKV6,           "rwkv6"        },
     { LLM_ARCH_GRANITE,         "granite"      },
+    { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -1467,6 +1469,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
@@ -1478,6 +1481,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2396,7 +2417,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;
 
-    // Additional scale factors (Granite)
+    // Additional scale factors (Granite/Granite MoE)
     float f_residual_scale  = 0.0f;
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
@@ -6048,6 +6069,7 @@ static void llm_load_hparams(
                 }
             } break;
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -6056,6 +6078,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
 
                 switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = e_model::MODEL_3B; break;
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -6810,7 +6833,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
     }
 
-    if (model.arch == LLM_ARCH_GRANITE) {
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -6984,6 +7007,7 @@ static bool llm_load_tensors(
             case LLM_ARCH_REFACT:
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
+            case LLM_ARCH_GRANITE_MOE:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -15930,6 +15954,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -19231,6 +19256,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2

From afbbfaa537a96f562c34df4542930fa951b40d9e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 25 Sep 2024 14:05:13 +0200
Subject: [PATCH 41/78] server : add more env vars, improve gen-docs (#9635)

* server : add more env vars, improve gen-docs

* update server docs

* LLAMA_ARG_NO_CONTEXT_SHIFT
---
 common/arg.cpp                 |  56 ++++++++--------
 examples/gen-docs/gen-docs.cpp |  85 +++++++++++++++--------
 examples/server/README.md      | 119 +++++++++++++++++++--------------
 examples/server/server.cpp     |   4 ++
 4 files changed, 157 insertions(+), 107 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index c1ec3c4f9..6880117ed 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non,causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1205,7 +1205,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
@@ -1213,7 +1213,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
@@ -1221,7 +1221,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1355,7 +1355,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
@@ -1363,14 +1363,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1779,14 +1779,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -1794,7 +1794,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.timeout_read  = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp
index b6d4725fd..4b19a9dc2 100644
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -6,42 +6,73 @@
 
 // Export usage message (-h) to markdown format
 
+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+    if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
 static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     gpt_params params;
     auto ctx_arg = gpt_params_parser_init(params, ex);
 
-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
+    std::vector<llama_arg *> common_options;
+    std::vector<llama_arg *> sparam_options;
+    std::vector<llama_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
-        file << "| `";
-        // args
-        for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
-                file << arg;
-                if (opt.args.size() > 1) file << ", ";
-            } else {
-                file << arg << (arg != opt.args.back() ? ", " : "");
-            }
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
         }
-        // value hint
-        if (opt.value_hint) {
-            std::string md_value_hint(opt.value_hint);
-            string_replace_all(md_value_hint, "|", "\\|");
-            file << " " << md_value_hint;
-        }
-        if (opt.value_hint_2) {
-            std::string md_value_hint_2(opt.value_hint_2);
-            string_replace_all(md_value_hint_2, "|", "\\|");
-            file << " " << md_value_hint_2;
-        }
-        // help text
-        std::string md_help(opt.help);
-        string_replace_all(md_help, "\n", "<br/>");
-        string_replace_all(md_help, "|", "\\|");
-        file << "` | " << md_help << " |\n";
     }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
 }
 
 int main(int, char **) {
diff --git a/examples/server/README.md b/examples/server/README.md
index 741950c8a..dfca07f98 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -17,6 +17,8 @@ The project is under active development, and we are [looking for feedback and co
 
 ## Usage
 
+**Common params**
+
 | Argument | Explanation |
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
@@ -38,7 +40,6 @@ The project is under active development, and we are [looking for feedback and co
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
 | `-p, --prompt PROMPT` | prompt to start generation with |
 | `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
@@ -46,8 +47,56 @@ The project is under active development, and we are [looking for feedback and co
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
-| `-sp, --special` | special tokens output enabled (default: false) |
-| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
+| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
+| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
+| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
+| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
+| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
+| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
+| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
+| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
+| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
+| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
+| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
+| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
+| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
+| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
+| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
+| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
+| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
+| `--check-tensors` | check model tensor data for invalid values (default: false) |
+| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
+| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
+| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
+| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
+| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
+| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
+| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
+| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
+| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
+| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
+| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
+| `--log-disable` | Log disable |
+| `--log-file FNAME` | Log to file |
+| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
+| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
+| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
+
+
+**Sampling params**
+
+| Argument | Explanation |
+| -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
 | `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
@@ -72,54 +121,28 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model |
-| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N |
-| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) |
-| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N |
-| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) |
-| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) |
-| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) |
-| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) |
-| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) |
-| `-gan, --grp-attn-n N` | group-attention factor (default: 1) |
-| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) |
-| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
-| `-nkvo, --no-kv-offload` | disable KV offload |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
-| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
+
+
+**Example-specific params**
+
+| Argument | Explanation |
+| -------- | ----------- |
+| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
+| `-sp, --special` | special tokens output enabled (default: false) |
+| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
+| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
-| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
-| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
-| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
-| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
-| `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
-| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
-| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
-| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
-| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
-| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
-| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
-| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
-| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
-| `--path PATH` | path to serve static files from (default: ) |
+| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
-| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
-| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
+| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
+| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
@@ -128,14 +151,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
-| `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file |
-| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
-| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
-| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
-| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index e5275a514..61ff09bb2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2356,6 +2356,10 @@ int main(int argc, char ** argv) {
         svr.reset(new httplib::Server());
     }
 #else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return 1;
+    }
     svr.reset(new httplib::Server());
 #endif
 

From 1e436302188a704ac9ea044af03193648806f19c Mon Sep 17 00:00:00 2001
From: Charles Xu <63788048+chaxu01@users.noreply.github.com>
Date: Wed, 25 Sep 2024 15:12:20 +0200
Subject: [PATCH 42/78] ggml : remove assert for AArch64 GEMV and GEMM Q4
 kernels (#9217)

* ggml : remove assert for AArch64 GEMV and GEMM Q4 kernels

* added fallback mechanism when the offline re-quantized model is not
optimized for the underlying target.

* fix for build errors

* remove prints from the low-level code

* Rebase to the latest upstream
---
 ggml/src/ggml-aarch64.c | 3226 +++++++++++++++++++--------------------
 1 file changed, 1591 insertions(+), 1635 deletions(-)

diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 2b01b4f93..8912de63d 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -1,4 +1,7 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-License-Identifier: MIT
+//
+
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 
@@ -595,6 +598,15 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
+// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
+static int sve_lane_count(void) {
+#if defined(__ARM_FEATURE_SVE)
+    return ggml_sve_cnt_b;
+#else
+    return 0;
+#endif
+}
+
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -614,73 +626,67 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    const void * b_ptr = vx;
-    const void * a_ptr = vy;
-    float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
 
-    __asm__ __volatile__(
-        "movi v31.16b, #0x4\n"
-        "movi v30.16b, #0xf0\n"
-        "add %x[b_ptr], %x[b_ptr], #0x8\n"
-        "1:"  // Column loop
-        "add x22, %x[a_ptr], #0x2\n"
-        "movi v29.16b, #0x0\n"
-        "mov x21, %x[nb]\n"
-        "2:"  // Block loop
-        "ldr q28, [%x[b_ptr], #0x0]\n"
-        "ldr q27, [x22, #0x0]\n"
-        "movi v26.4s, #0x0\n"
-        "sub x20, x22, #0x2\n"
-        "ldr q25, [x22, #0x10]\n"
-        "ldr q24, [%x[b_ptr], #0x10]\n"
-        "sub x21, x21, #0x1\n"
-        "add x22, x22, #0x22\n"
-        "ldr q23, [%x[b_ptr], #0x20]\n"
-        "ldr q22, [%x[b_ptr], #0x30]\n"
-        "ld1r { v21.8h }, [x20]\n"
-        "ldr q20, [%x[b_ptr], #-0x8]\n"
-        "sshl v16.16b, v28.16b, v31.16b\n"
-        "and v28.16b, v28.16b, v30.16b\n"
-        "sshl v19.16b, v24.16b, v31.16b\n"
-        "and v24.16b, v24.16b, v30.16b\n"
-        "add %x[b_ptr], %x[b_ptr], #0x48\n"
-        "sshl v18.16b, v23.16b, v31.16b\n"
-        "and v23.16b, v23.16b, v30.16b\n"
-        ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
-        "sshl v17.16b, v22.16b, v31.16b\n"
-        "and v22.16b, v22.16b, v30.16b\n"
-        "fcvtl v21.4s, v21.4h\n"
-        "fcvtl v16.4s, v20.4h\n"
-        ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
-        "fmul v16.4s, v16.4s, v21.4s\n"
-        ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
-        ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
-        ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
-        ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
-        ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
-        ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
-        "scvtf v26.4s, v26.4s, #0x4\n"
-        "fmla v29.4s, v26.4s, v16.4s\n"
-        "cbnz x21, 2b\n"
-        "sub %x[nc], %x[nc], #0x4\n"
-        "str q29, [%x[res_ptr], #0x0]\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "cbnz %x[nc], 1b\n"
-        : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-        : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-        : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
-    );
-#else
+        __asm__ __volatile__(
+            "movi v31.16b, #0x4\n"
+            "movi v30.16b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x8\n"
+            "1:"  // Column loop
+            "add x22, %x[a_ptr], #0x2\n"
+            "movi v29.16b, #0x0\n"
+            "mov x21, %x[nb]\n"
+            "2:"  // Block loop
+            "ldr q28, [%x[b_ptr], #0x0]\n"
+            "ldr q27, [x22, #0x0]\n"
+            "movi v26.4s, #0x0\n"
+            "sub x20, x22, #0x2\n"
+            "ldr q25, [x22, #0x10]\n"
+            "ldr q24, [%x[b_ptr], #0x10]\n"
+            "sub x21, x21, #0x1\n"
+            "add x22, x22, #0x22\n"
+            "ldr q23, [%x[b_ptr], #0x20]\n"
+            "ldr q22, [%x[b_ptr], #0x30]\n"
+            "ld1r { v21.8h }, [x20]\n"
+            "ldr q20, [%x[b_ptr], #-0x8]\n"
+            "sshl v16.16b, v28.16b, v31.16b\n"
+            "and v28.16b, v28.16b, v30.16b\n"
+            "sshl v19.16b, v24.16b, v31.16b\n"
+            "and v24.16b, v24.16b, v30.16b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x48\n"
+            "sshl v18.16b, v23.16b, v31.16b\n"
+            "and v23.16b, v23.16b, v30.16b\n"
+            ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
+            "sshl v17.16b, v22.16b, v31.16b\n"
+            "and v22.16b, v22.16b, v30.16b\n"
+            "fcvtl v21.4s, v21.4h\n"
+            "fcvtl v16.4s, v20.4h\n"
+            ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
+            "fmul v16.4s, v16.4s, v21.4s\n"
+            ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
+            ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
+            ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
+            ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
+            ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
+            ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "fmla v29.4s, v26.4s, v16.4s\n"
+            "cbnz x21, 2b\n"
+            "sub %x[nc], %x[nc], #0x4\n"
+            "str q29, [%x[res_ptr], #0x0]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
+            );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
     float sumf[4];
     int sumi;
 
@@ -704,7 +710,6 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
         }
         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
 }
 
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -726,79 +731,72 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    const void * b_ptr = vx;
-    const void * a_ptr = vy;
-    float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
 
-    __asm__ __volatile__(
-        "movi v2.16b, #0x4\n"
-        "movi v1.16b, #0xf0\n"
-        "add %x[b_ptr], %x[b_ptr], #0x8\n"
-        "1:"  // Column loop
-        "add x23, %x[a_ptr], #0x2\n"
-        "movi v0.16b, #0x0\n"
-        "mov x22, %x[nb]\n"
-        "2:"  // Block loop
-        "ldr q31, [%x[b_ptr], #0x0]\n"
-        "ldr q30, [%x[b_ptr], #0x10]\n"
-        "mov x21, x23\n"
-        "movi v29.4s, #0x0\n"
-        "ldr q28, [%x[b_ptr], #0x20]\n"
-        "ldr q27, [%x[b_ptr], #0x30]\n"
-        "movi v26.4s, #0x0\n"
-        "sub x20, x23, #0x2\n"
-        "ld1r { v25.8h }, [x20]\n"
-        "ldr q24, [%x[b_ptr], #-0x8]\n"
-        "sub x22, x22, #0x1\n"
-        "add x23, x23, #0x22\n"
-        "ld1r { v23.2d }, [x21], #0x8\n"
-        "sshl v22.16b, v31.16b, v2.16b\n"
-        "sshl v16.16b, v30.16b, v2.16b\n"
-        "add %x[b_ptr], %x[b_ptr], #0x48\n"
-        "ld1r { v21.2d }, [x21], #0x8\n"
-        "sshl v20.16b, v28.16b, v2.16b\n"
-        "sshl v19.16b, v27.16b, v2.16b\n"
-        "ld1r { v18.2d }, [x21], #0x8\n"
-        "ld1r { v17.2d }, [x21], #0x8\n"
-        "and v31.16b, v31.16b, v1.16b\n"
-        "and v30.16b, v30.16b, v1.16b\n"
-        ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-        ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-        "and v28.16b, v28.16b, v1.16b\n"
-        "and v27.16b, v27.16b, v1.16b\n"
-        "fcvtl v25.4s, v25.4h\n"
-        "fcvtl v16.4s, v24.4h\n"
-        ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-        ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-        "fmul v16.4s, v16.4s, v25.4s\n"
-        ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-        ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-        ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-        ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-        "addp v29.4s, v29.4s, v26.4s\n"
-        "scvtf v29.4s, v29.4s, #0x4\n"
-        "fmla v0.4s, v29.4s, v16.4s\n"
-        "cbnz x22, 2b\n"
-        "sub %x[nc], %x[nc], #0x4\n"
-        "str q0, [%x[res_ptr], #0x0]\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "cbnz %x[nc], 1b\n"
-        : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-        : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-        : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-    );
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
-#else
+        __asm__ __volatile__(
+            "movi v2.16b, #0x4\n"
+            "movi v1.16b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x8\n"
+            "1:"  // Column loop
+            "add x23, %x[a_ptr], #0x2\n"
+            "movi v0.16b, #0x0\n"
+            "mov x22, %x[nb]\n"
+            "2:"  // Block loop
+            "ldr q31, [%x[b_ptr], #0x0]\n"
+            "ldr q30, [%x[b_ptr], #0x10]\n"
+            "mov x21, x23\n"
+            "movi v29.4s, #0x0\n"
+            "ldr q28, [%x[b_ptr], #0x20]\n"
+            "ldr q27, [%x[b_ptr], #0x30]\n"
+            "movi v26.4s, #0x0\n"
+            "sub x20, x23, #0x2\n"
+            "ld1r { v25.8h }, [x20]\n"
+            "ldr q24, [%x[b_ptr], #-0x8]\n"
+            "sub x22, x22, #0x1\n"
+            "add x23, x23, #0x22\n"
+            "ld1r { v23.2d }, [x21], #0x8\n"
+            "sshl v22.16b, v31.16b, v2.16b\n"
+            "sshl v16.16b, v30.16b, v2.16b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x48\n"
+            "ld1r { v21.2d }, [x21], #0x8\n"
+            "sshl v20.16b, v28.16b, v2.16b\n"
+            "sshl v19.16b, v27.16b, v2.16b\n"
+            "ld1r { v18.2d }, [x21], #0x8\n"
+            "ld1r { v17.2d }, [x21], #0x8\n"
+            "and v31.16b, v31.16b, v1.16b\n"
+            "and v30.16b, v30.16b, v1.16b\n"
+            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
+            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
+            "and v28.16b, v28.16b, v1.16b\n"
+            "and v27.16b, v27.16b, v1.16b\n"
+            "fcvtl v25.4s, v25.4h\n"
+            "fcvtl v16.4s, v24.4h\n"
+            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
+            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
+            "fmul v16.4s, v16.4s, v25.4s\n"
+            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
+            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
+            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
+            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
+            "addp v29.4s, v29.4s, v26.4s\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "fmla v0.4s, v29.4s, v16.4s\n"
+            "cbnz x22, 2b\n"
+            "sub %x[nc], %x[nc], #0x4\n"
+            "str q0, [%x[res_ptr], #0x0]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     float sumf[4];
     int sumi;
 
@@ -822,7 +820,6 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
 }
 
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -844,8 +841,9 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+    if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -910,24 +908,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         );
         return;
     }
-    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
-                    "performance");
-    }
-    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
-                    "quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(ggml_cpu_has_sve() &&
-                "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
+#endif // #if defined(__ARM_FEATURE_SVE)
 #elif defined(__AVX2__)
     // Lookup table to convert signed nibbles to signed bytes
     __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
@@ -1018,31 +999,33 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
             _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
         }
     }
-#else
-    float sumf[8];
-    int sumi;
+    return;
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    {
+        float sumf[8];
+        int sumi;
 
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
 
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
                     }
-                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
 }
 
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1065,505 +1048,500 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v23.16b, #0x0\n"
+            "movi v16.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v0.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v21.16b, #0x0\n"
+            "movi v8.16b, #0x0\n"
+            "movi v1.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q3, [x28, #0x0]\n"
+            "ldr q31, [x25, #0x0]\n"
+            "movi v28.16b, #0x4\n"
+            "movi v10.4s, #0x0\n"
+            "ldr q22, [x28, #0x10]\n"
+            "ldr q6, [x25, #0x10]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            "ldr q27, [x28, #0x20]\n"
+            "ldr q30, [x28, #0x30]\n"
+            "movi v20.4s, #0x0\n"
+            "movi v24.16b, #0xf0\n"
+            "ldr d2, [x25, #-0x8]\n"
+            "ldr d26, [x23, #-0x8]\n"
+            "sshl v12.16b, v3.16b, v28.16b\n"
+            "sub x20, x28, #0x8\n"
+            "ldr d17, [x20, #0x0]\n"
+            "and v3.16b, v3.16b, v24.16b\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
+            ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
+            ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
+            ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
+            "sshl v31.16b, v22.16b, v28.16b\n"
+            "and v22.16b, v22.16b, v24.16b\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "fcvtl v2.4s, v2.4h\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
+            ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
+            ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
+            ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
+            "sshl v6.16b, v27.16b, v28.16b\n"
+            "sshl v28.16b, v30.16b, v28.16b\n"
+            "and v27.16b, v27.16b, v24.16b\n"
+            "and v30.16b, v30.16b, v24.16b\n"
+            "ldr q24, [x25, #0x20]\n"
+            ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x30]\n"
+            ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x40]\n"
+            ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x50]\n"
+            ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
+            ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x60]\n"
+            ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
+            ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
+            "fmul v24.4s, v17.4s, v2.s[0]\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v15.4s, v10.4s, v24.4s\n"
+            "ldr q24, [x23, #0x0]\n"
+            "fmul v10.4s, v17.4s, v2.s[1]\n"
+            "fmla v19.4s, v29.4s, v10.4s\n"
+            "ldr q10, [x23, #0x10]\n"
+            "fmul v29.4s, v17.4s, v2.s[2]\n"
+            "fmul v2.4s, v17.4s, v2.s[3]\n"
+            "fmla v18.4s, v9.4s, v29.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
+            "fmla v14.4s, v20.4s, v2.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x20]\n"
+            ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
+            ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
+            ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x30]\n"
+            ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x40]\n"
+            ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
+            ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
+            ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
+            ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x50]\n"
+            ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x60]\n"
+            ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
+            ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
+            ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
+            ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x0]\n"
+            ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
+            ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
+            ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
+            "fmul v10.4s, v17.4s, v26.s[0]\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v11.4s, v9.4s, v10.4s\n"
+            "ldr q9, [x22, #0x10]\n"
+            "fmul v10.4s, v17.4s, v26.s[1]\n"
+            "fmla v13.4s, v29.4s, v10.4s\n"
+            "ldr d29, [x22, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v26.s[2]\n"
+            "fmul v26.4s, v17.4s, v26.s[3]\n"
+            "fcvtl v29.4s, v29.4h\n"
+            "fmla v23.4s, v20.4s, v10.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v16.4s, v2.4s, v26.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x20]\n"
+            ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x30]\n"
+            ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x40]\n"
+            ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x50]\n"
+            ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x60]\n"
+            ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x21, #0x0]\n"
+            ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
+            "fmul v9.4s, v17.4s, v29.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v25.4s, v20.4s, v9.4s\n"
+            "ldr q9, [x21, #0x10]\n"
+            "fmul v20.4s, v17.4s, v29.s[1]\n"
+            "fmla v7.4s, v10.4s, v20.4s\n"
+            "ldr d20, [x21, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v29.s[2]\n"
+            "fmul v29.4s, v17.4s, v29.s[3]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            "fmla v0.4s, v26.4s, v10.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v4.4s, v2.4s, v29.4s\n"
+            "movi v2.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
+            "ldr q12, [x21, #0x20]\n"
+            "fmul v24.4s, v17.4s, v20.s[0]\n"
+            ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x30]\n"
+            "fmul v31.4s, v17.4s, v20.s[1]\n"
+            ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
+            ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
+            ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x40]\n"
+            "fmul v6.4s, v17.4s, v20.s[2]\n"
+            "fmul v20.4s, v17.4s, v20.s[3]\n"
+            ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x50]\n"
+            ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
+            ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
+            ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x60]\n"
+            ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
+            "ldr q17, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
+            ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
+            ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
+            ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
+            ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
+            ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
+            ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
+            ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "fmla v5.4s, v26.4s, v24.4s\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "fmla v21.4s, v10.4s, v31.4s\n"
+            "fmla v8.4s, v2.4s, v6.4s\n"
+            "fmla v1.4s, v29.4s, v20.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q16, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q0, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q21, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q8, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q1, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q7, [x24, #0x0]\n"
+            "ldr q5, [x25, #0x0]\n"
+            "movi v9.16b, #0x4\n"
+            "movi v4.4s, #0x0\n"
+            "ldr q3, [x24, #0x10]\n"
+            "ldr q2, [x25, #0x10]\n"
+            "movi v1.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q13, [x24, #0x20]\n"
+            "ldr q31, [x25, #0x20]\n"
+            "movi v30.4s, #0x0\n"
+            "movi v29.16b, #0xf0\n"
+            "ldr q28, [x24, #0x30]\n"
+            "ldr q27, [x25, #0x30]\n"
+            "sshl v20.16b, v7.16b, v9.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr q26, [x25, #0x40]\n"
+            "ldr q25, [x25, #0x50]\n"
+            "sshl v17.16b, v3.16b, v9.16b\n"
+            "and v7.16b, v7.16b, v29.16b\n"
+            "ldr q24, [x25, #0x60]\n"
+            "ldr q16, [x25, #0x70]\n"
+            "sshl v22.16b, v13.16b, v9.16b\n"
+            "and v3.16b, v3.16b, v29.16b\n"
+            "ldr d21, [x20, #0x0]\n"
+            "ldr d12, [x25, #-0x8]\n"
+            ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
+            ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
+            ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
+            ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
+            "sshl v9.16b, v28.16b, v9.16b\n"
+            "subs x21, x21, #0x1\n"
+            "and v13.16b, v13.16b, v29.16b\n"
+            "and v28.16b, v28.16b, v29.16b\n"
+            "add x25, x25, #0x88\n"
+            "add x24, x24, #0x48\n"
+            "fcvtl v21.4s, v21.4h\n"
+            "fcvtl v12.4s, v12.4h\n"
+            ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
+            ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
+            ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
+            ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
+            "fmul v11.4s, v21.4s, v12.s[0]\n"
+            "fmul v23.4s, v21.4s, v12.s[1]\n"
+            "fmul v17.4s, v21.4s, v12.s[2]\n"
+            ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
+            "fmul v6.4s, v21.4s, v12.s[3]\n"
+            ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
+            ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
+            ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
+            ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
+            ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
+            ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
+            ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
+            ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
+            ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
+            ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
+            ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
+            ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
+            ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
+            ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
+            ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
+            ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
+            ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
+            ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
+            ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
+            ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
+            ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
+            "scvtf v4.4s, v4.4s, #0x4\n"
+            "scvtf v1.4s, v1.4s, #0x4\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "fmla v15.4s, v4.4s, v11.4s\n"
+            "scvtf v30.4s, v30.4s, #0x4\n"
+            "fmla v19.4s, v1.4s, v23.4s\n"
+            "fmla v18.4s, v0.4s, v17.4s\n"
+            "fmla v14.4s, v30.4s, v6.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q14, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
     }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    const void * b_ptr = vx;
-    const void * a_ptr = vy;
-    float * res_ptr = s;
-    size_t res_stride = bs * sizeof(float);
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
 
-    __asm__ __volatile__(
-        "mov x10, %x[nr]\n"
-        "mov x9, #0x88\n"
-        "cmp x10, #0x10\n"
-        "mul x9, %x[nb], x9\n"
-        "blt 4f\n"
-        "1:"  // Row loop
-        "add x28, %x[b_ptr], #0x8\n"
-        "mov x27, %x[nc]\n"
-        "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
-        "2:"  // Column loop
-        "add x25, %x[a_ptr], #0x8\n"
-        "movi v15.16b, #0x0\n"
-        "movi v19.16b, #0x0\n"
-        "mov x24, %x[nb]\n"
-        "add x23, x25, x9\n"
-        "movi v18.16b, #0x0\n"
-        "movi v14.16b, #0x0\n"
-        "add x22, x23, x9\n"
-        "movi v11.16b, #0x0\n"
-        "movi v13.16b, #0x0\n"
-        "add x21, x22, x9\n"
-        "movi v23.16b, #0x0\n"
-        "movi v16.16b, #0x0\n"
-        "movi v25.16b, #0x0\n"
-        "movi v7.16b, #0x0\n"
-        "movi v0.16b, #0x0\n"
-        "movi v4.16b, #0x0\n"
-        "movi v5.16b, #0x0\n"
-        "movi v21.16b, #0x0\n"
-        "movi v8.16b, #0x0\n"
-        "movi v1.16b, #0x0\n"
-        "3:"  // Block loop
-        "ldr q3, [x28, #0x0]\n"
-        "ldr q31, [x25, #0x0]\n"
-        "movi v28.16b, #0x4\n"
-        "movi v10.4s, #0x0\n"
-        "ldr q22, [x28, #0x10]\n"
-        "ldr q6, [x25, #0x10]\n"
-        "movi v29.4s, #0x0\n"
-        "movi v9.4s, #0x0\n"
-        "ldr q27, [x28, #0x20]\n"
-        "ldr q30, [x28, #0x30]\n"
-        "movi v20.4s, #0x0\n"
-        "movi v24.16b, #0xf0\n"
-        "ldr d2, [x25, #-0x8]\n"
-        "ldr d26, [x23, #-0x8]\n"
-        "sshl v12.16b, v3.16b, v28.16b\n"
-        "sub x20, x28, #0x8\n"
-        "ldr d17, [x20, #0x0]\n"
-        "and v3.16b, v3.16b, v24.16b\n"
-        "subs x24, x24, #0x1\n"
-        "add x28, x28, #0x48\n"
-        ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
-        ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
-        ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
-        ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
-        "sshl v31.16b, v22.16b, v28.16b\n"
-        "and v22.16b, v22.16b, v24.16b\n"
-        "fcvtl v17.4s, v17.4h\n"
-        "fcvtl v2.4s, v2.4h\n"
-        "fcvtl v26.4s, v26.4h\n"
-        ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
-        ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
-        ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
-        ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
-        "sshl v6.16b, v27.16b, v28.16b\n"
-        "sshl v28.16b, v30.16b, v28.16b\n"
-        "and v27.16b, v27.16b, v24.16b\n"
-        "and v30.16b, v30.16b, v24.16b\n"
-        "ldr q24, [x25, #0x20]\n"
-        ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
-        ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
-        "ldr q24, [x25, #0x30]\n"
-        ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
-        ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
-        ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
-        "ldr q24, [x25, #0x40]\n"
-        ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
-        ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
-        "ldr q24, [x25, #0x50]\n"
-        ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
-        ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
-        ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
-        "ldr q24, [x25, #0x60]\n"
-        ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
-        ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
-        ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
-        "ldr q24, [x25, #0x70]\n"
-        "add x25, x25, #0x88\n"
-        ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
-        ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
-        ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
-        "fmul v24.4s, v17.4s, v2.s[0]\n"
-        "scvtf v10.4s, v10.4s, #0x4\n"
-        "scvtf v29.4s, v29.4s, #0x4\n"
-        "scvtf v9.4s, v9.4s, #0x4\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "fmla v15.4s, v10.4s, v24.4s\n"
-        "ldr q24, [x23, #0x0]\n"
-        "fmul v10.4s, v17.4s, v2.s[1]\n"
-        "fmla v19.4s, v29.4s, v10.4s\n"
-        "ldr q10, [x23, #0x10]\n"
-        "fmul v29.4s, v17.4s, v2.s[2]\n"
-        "fmul v2.4s, v17.4s, v2.s[3]\n"
-        "fmla v18.4s, v9.4s, v29.4s\n"
-        "movi v9.4s, #0x0\n"
-        "movi v29.4s, #0x0\n"
-        ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
-        "fmla v14.4s, v20.4s, v2.4s\n"
-        "movi v20.4s, #0x0\n"
-        "movi v2.4s, #0x0\n"
-        ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
-        "ldr q24, [x23, #0x20]\n"
-        ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
-        ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
-        ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
-        ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
-        "ldr q10, [x23, #0x30]\n"
-        ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
-        ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
-        "ldr q24, [x23, #0x40]\n"
-        ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
-        ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
-        ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
-        ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
-        "ldr q10, [x23, #0x50]\n"
-        ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
-        ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
-        "ldr q24, [x23, #0x60]\n"
-        ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
-        ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
-        ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
-        ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
-        "ldr q10, [x23, #0x70]\n"
-        "add x23, x23, #0x88\n"
-        ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
-        ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
-        ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
-        "ldr q24, [x22, #0x0]\n"
-        ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
-        ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
-        ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
-        ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
-        "fmul v10.4s, v17.4s, v26.s[0]\n"
-        "scvtf v9.4s, v9.4s, #0x4\n"
-        "scvtf v29.4s, v29.4s, #0x4\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "scvtf v2.4s, v2.4s, #0x4\n"
-        "fmla v11.4s, v9.4s, v10.4s\n"
-        "ldr q9, [x22, #0x10]\n"
-        "fmul v10.4s, v17.4s, v26.s[1]\n"
-        "fmla v13.4s, v29.4s, v10.4s\n"
-        "ldr d29, [x22, #-0x8]\n"
-        "fmul v10.4s, v17.4s, v26.s[2]\n"
-        "fmul v26.4s, v17.4s, v26.s[3]\n"
-        "fcvtl v29.4s, v29.4h\n"
-        "fmla v23.4s, v20.4s, v10.4s\n"
-        "movi v20.4s, #0x0\n"
-        "movi v10.4s, #0x0\n"
-        "fmla v16.4s, v2.4s, v26.4s\n"
-        "movi v26.4s, #0x0\n"
-        "movi v2.4s, #0x0\n"
-        ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
-        ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
-        "ldr q24, [x22, #0x20]\n"
-        ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
-        ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
-        ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
-        "ldr q9, [x22, #0x30]\n"
-        ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
-        ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
-        "ldr q24, [x22, #0x40]\n"
-        ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
-        ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
-        ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
-        "ldr q9, [x22, #0x50]\n"
-        ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
-        ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
-        "ldr q24, [x22, #0x60]\n"
-        ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
-        ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
-        ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
-        "ldr q9, [x22, #0x70]\n"
-        "add x22, x22, #0x88\n"
-        ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
-        ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
-        ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
-        "ldr q24, [x21, #0x0]\n"
-        ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
-        ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
-        ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
-        "fmul v9.4s, v17.4s, v29.s[0]\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "scvtf v10.4s, v10.4s, #0x4\n"
-        "scvtf v26.4s, v26.4s, #0x4\n"
-        "scvtf v2.4s, v2.4s, #0x4\n"
-        "fmla v25.4s, v20.4s, v9.4s\n"
-        "ldr q9, [x21, #0x10]\n"
-        "fmul v20.4s, v17.4s, v29.s[1]\n"
-        "fmla v7.4s, v10.4s, v20.4s\n"
-        "ldr d20, [x21, #-0x8]\n"
-        "fmul v10.4s, v17.4s, v29.s[2]\n"
-        "fmul v29.4s, v17.4s, v29.s[3]\n"
-        "fcvtl v20.4s, v20.4h\n"
-        "fmla v0.4s, v26.4s, v10.4s\n"
-        "movi v26.4s, #0x0\n"
-        "movi v10.4s, #0x0\n"
-        "fmla v4.4s, v2.4s, v29.4s\n"
-        "movi v2.4s, #0x0\n"
-        "movi v29.4s, #0x0\n"
-        ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
-        ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
-        "ldr q12, [x21, #0x20]\n"
-        "fmul v24.4s, v17.4s, v20.s[0]\n"
-        ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
-        ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
-        ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
-        "ldr q9, [x21, #0x30]\n"
-        "fmul v31.4s, v17.4s, v20.s[1]\n"
-        ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
-        ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
-        ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
-        ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
-        "ldr q12, [x21, #0x40]\n"
-        "fmul v6.4s, v17.4s, v20.s[2]\n"
-        "fmul v20.4s, v17.4s, v20.s[3]\n"
-        ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
-        ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
-        ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
-        "ldr q9, [x21, #0x50]\n"
-        ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
-        ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
-        ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
-        ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
-        "ldr q12, [x21, #0x60]\n"
-        ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
-        ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
-        ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
-        ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
-        "ldr q17, [x21, #0x70]\n"
-        "add x21, x21, #0x88\n"
-        ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
-        ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
-        ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
-        ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
-        ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
-        ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
-        ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
-        ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
-        "scvtf v26.4s, v26.4s, #0x4\n"
-        "scvtf v10.4s, v10.4s, #0x4\n"
-        "fmla v5.4s, v26.4s, v24.4s\n"
-        "scvtf v2.4s, v2.4s, #0x4\n"
-        "scvtf v29.4s, v29.4s, #0x4\n"
-        "fmla v21.4s, v10.4s, v31.4s\n"
-        "fmla v8.4s, v2.4s, v6.4s\n"
-        "fmla v1.4s, v29.4s, v20.4s\n"
-        "bgt 3b\n"
-        "mov x20, %x[res_ptr]\n"
-        "subs x27, x27, #0x4\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "str q15, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q19, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q18, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q14, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q11, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q13, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q23, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q16, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q25, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q7, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q0, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q4, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q5, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q21, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q8, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q1, [x20, #0x0]\n"
-        "bne 2b\n"
-        "mov x20, #0x4\n"
-        "sub x10, x10, #0x10\n"
-        "cmp x10, #0x10\n"
-        "mov %x[res_ptr], x26\n"
-        "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
-        "bge 1b\n"
-        "4:"  // Row loop skip
-        "cbz x10, 9f\n"
-        "5:"  // Row tail: Row loop
-        "add x24, %x[b_ptr], #0x8\n"
-        "mov x23, %x[nc]\n"
-        "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
-        "6:"  // Row tail: Column loop
-        "movi v15.16b, #0x0\n"
-        "movi v19.16b, #0x0\n"
-        "add x25, %x[a_ptr], #0x8\n"
-        "mov x21, %x[nb]\n"
-        "movi v18.16b, #0x0\n"
-        "movi v14.16b, #0x0\n"
-        "7:"  // Row tail: Block loop
-        "ldr q7, [x24, #0x0]\n"
-        "ldr q5, [x25, #0x0]\n"
-        "movi v9.16b, #0x4\n"
-        "movi v4.4s, #0x0\n"
-        "ldr q3, [x24, #0x10]\n"
-        "ldr q2, [x25, #0x10]\n"
-        "movi v1.4s, #0x0\n"
-        "movi v0.4s, #0x0\n"
-        "ldr q13, [x24, #0x20]\n"
-        "ldr q31, [x25, #0x20]\n"
-        "movi v30.4s, #0x0\n"
-        "movi v29.16b, #0xf0\n"
-        "ldr q28, [x24, #0x30]\n"
-        "ldr q27, [x25, #0x30]\n"
-        "sshl v20.16b, v7.16b, v9.16b\n"
-        "sub x20, x24, #0x8\n"
-        "ldr q26, [x25, #0x40]\n"
-        "ldr q25, [x25, #0x50]\n"
-        "sshl v17.16b, v3.16b, v9.16b\n"
-        "and v7.16b, v7.16b, v29.16b\n"
-        "ldr q24, [x25, #0x60]\n"
-        "ldr q16, [x25, #0x70]\n"
-        "sshl v22.16b, v13.16b, v9.16b\n"
-        "and v3.16b, v3.16b, v29.16b\n"
-        "ldr d21, [x20, #0x0]\n"
-        "ldr d12, [x25, #-0x8]\n"
-        ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
-        ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
-        ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
-        ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
-        "sshl v9.16b, v28.16b, v9.16b\n"
-        "subs x21, x21, #0x1\n"
-        "and v13.16b, v13.16b, v29.16b\n"
-        "and v28.16b, v28.16b, v29.16b\n"
-        "add x25, x25, #0x88\n"
-        "add x24, x24, #0x48\n"
-        "fcvtl v21.4s, v21.4h\n"
-        "fcvtl v12.4s, v12.4h\n"
-        ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
-        ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
-        ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
-        ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
-        "fmul v11.4s, v21.4s, v12.s[0]\n"
-        "fmul v23.4s, v21.4s, v12.s[1]\n"
-        "fmul v17.4s, v21.4s, v12.s[2]\n"
-        ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
-        "fmul v6.4s, v21.4s, v12.s[3]\n"
-        ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
-        ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
-        ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
-        ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
-        ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
-        ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
-        ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
-        ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
-        ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
-        ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
-        ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
-        ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
-        ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
-        ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
-        ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
-        ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
-        ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
-        ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
-        ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
-        ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
-        ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
-        ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
-        ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
-        "scvtf v4.4s, v4.4s, #0x4\n"
-        "scvtf v1.4s, v1.4s, #0x4\n"
-        "scvtf v0.4s, v0.4s, #0x4\n"
-        "fmla v15.4s, v4.4s, v11.4s\n"
-        "scvtf v30.4s, v30.4s, #0x4\n"
-        "fmla v19.4s, v1.4s, v23.4s\n"
-        "fmla v18.4s, v0.4s, v17.4s\n"
-        "fmla v14.4s, v30.4s, v6.4s\n"
-        "bgt 7b\n"
-        "mov x20, %x[res_ptr]\n"
-        "cmp x10, #0x1\n"
-        "str q15, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "cmp x10, #0x2\n"
-        "str q19, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "cmp x10, #0x3\n"
-        "str q18, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "str q14, [x20, #0x0]\n"
-        "8:"  // Row tail: Accumulator store skip
-        "subs x23, x23, #0x4\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "bne 6b\n"
-        "subs x10, x10, #0x4\n"
-        "add %x[a_ptr], %x[a_ptr], x9\n"
-        "mov %x[res_ptr], x22\n"
-        "bgt 5b\n"
-        "9:"  // Row tail: Row loop skip
-        : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
-        : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
-    );
-#else
-    float sumf[4][4];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
                             }
-                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
                         }
                     }
                 }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
             }
         }
     }
-#endif
 }
 
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1586,413 +1564,406 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    const void * b_ptr = vx;
-    const void * a_ptr = vy;
-    float * res_ptr = s;
-    size_t res_stride = bs * sizeof(float);
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
 
-    __asm__ __volatile__(
-        "mov x10, %x[nr]\n"
-        "mov x9, #0x88\n"
-        "cmp x10, #0x10\n"
-        "mul x9, %x[nb], x9\n"
-        "blt 4f\n"
-        "1:"  // Row loop
-        "add x28, %x[b_ptr], #0x8\n"
-        "mov x27, %x[nc]\n"
-        "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
-        "2:"  // Column loop
-        "add x25, %x[a_ptr], #0x8\n"
-        "movi v2.16b, #0x0\n"
-        "movi v10.16b, #0x0\n"
-        "mov x24, %x[nb]\n"
-        "add x23, x25, x9\n"
-        "movi v12.16b, #0x0\n"
-        "movi v28.16b, #0x0\n"
-        "add x22, x23, x9\n"
-        "movi v11.16b, #0x0\n"
-        "movi v13.16b, #0x0\n"
-        "add x21, x22, x9\n"
-        "movi v22.16b, #0x0\n"
-        "movi v23.16b, #0x0\n"
-        "movi v25.16b, #0x0\n"
-        "movi v5.16b, #0x0\n"
-        "movi v7.16b, #0x0\n"
-        "movi v4.16b, #0x0\n"
-        "movi v6.16b, #0x0\n"
-        "movi v30.16b, #0x0\n"
-        "movi v24.16b, #0x0\n"
-        "movi v14.16b, #0x0\n"
-        "3:"  // Block loop
-        "ldr q21, [x28, #0x0]\n"
-        "ldr q16, [x28, #0x10]\n"
-        "movi v1.16b, #0x4\n"
-        "movi v19.4s, #0x0\n"
-        "ldr q27, [x25, #0x0]\n"
-        "ldr q15, [x25, #0x10]\n"
-        "movi v26.4s, #0x0\n"
-        "movi v18.4s, #0x0\n"
-        "ldr q29, [x28, #0x20]\n"
-        "ldr q3, [x28, #0x30]\n"
-        "movi v17.4s, #0x0\n"
-        "movi v0.16b, #0xf0\n"
-        "ldr d20, [x25, #-0x8]\n"
-        "ldr d9, [x23, #-0x8]\n"
-        "sshl v8.16b, v21.16b, v1.16b\n"
-        "sshl v31.16b, v16.16b, v1.16b\n"
-        "and v21.16b, v21.16b, v0.16b\n"
-        "and v16.16b, v16.16b, v0.16b\n"
-        "sub x20, x28, #0x8\n"
-        "subs x24, x24, #0x1\n"
-        "add x28, x28, #0x48\n"
-        ".inst 0x4e88a773  // smmla v19.4s, v27.16b, v8.16b\n"
-        ".inst 0x4e9fa77a  // smmla v26.4s, v27.16b, v31.16b\n"
-        "ldr q27, [x25, #0x20]\n"
-        ".inst 0x4e88a5f2  // smmla v18.4s, v15.16b, v8.16b\n"
-        ".inst 0x4e9fa5f1  // smmla v17.4s, v15.16b, v31.16b\n"
-        "sshl v15.16b, v29.16b, v1.16b\n"
-        "sshl v1.16b, v3.16b, v1.16b\n"
-        "and v29.16b, v29.16b, v0.16b\n"
-        "and v3.16b, v3.16b, v0.16b\n"
-        "ldr q0, [x25, #0x30]\n"
-        "fcvtl v20.4s, v20.4h\n"
-        ".inst 0x4e8fa773  // smmla v19.4s, v27.16b, v15.16b\n"
-        "fcvtl v9.4s, v9.4h\n"
-        ".inst 0x4e81a77a  // smmla v26.4s, v27.16b, v1.16b\n"
-        "ldr q27, [x25, #0x40]\n"
-        ".inst 0x4e8fa412  // smmla v18.4s, v0.16b, v15.16b\n"
-        ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
-        "ldr q0, [x25, #0x50]\n"
-        ".inst 0x4e95a773  // smmla v19.4s, v27.16b, v21.16b\n"
-        ".inst 0x4e90a77a  // smmla v26.4s, v27.16b, v16.16b\n"
-        "ldr q27, [x25, #0x60]\n"
-        ".inst 0x4e95a412  // smmla v18.4s, v0.16b, v21.16b\n"
-        ".inst 0x4e90a411  // smmla v17.4s, v0.16b, v16.16b\n"
-        "ldr q0, [x25, #0x70]\n"
-        "add x25, x25, #0x88\n"
-        ".inst 0x4e9da773  // smmla v19.4s, v27.16b, v29.16b\n"
-        ".inst 0x4e83a77a  // smmla v26.4s, v27.16b, v3.16b\n"
-        "ldr d27, [x20, #0x0]\n"
-        ".inst 0x4e9da412  // smmla v18.4s, v0.16b, v29.16b\n"
-        ".inst 0x4e83a411  // smmla v17.4s, v0.16b, v3.16b\n"
-        "fcvtl v27.4s, v27.4h\n"
-        "uzp1 v0.2d, v19.2d, v26.2d\n"
-        "uzp2 v26.2d, v19.2d, v26.2d\n"
-        "fmul v19.4s, v27.4s, v20.s[0]\n"
-        "scvtf v0.4s, v0.4s, #0x4\n"
-        "scvtf v26.4s, v26.4s, #0x4\n"
-        "fmla v2.4s, v0.4s, v19.4s\n"
-        "ldr q19, [x23, #0x0]\n"
-        "uzp1 v0.2d, v18.2d, v17.2d\n"
-        "uzp2 v18.2d, v18.2d, v17.2d\n"
-        "fmul v17.4s, v27.4s, v20.s[1]\n"
-        "scvtf v0.4s, v0.4s, #0x4\n"
-        "scvtf v18.4s, v18.4s, #0x4\n"
-        "fmla v10.4s, v26.4s, v17.4s\n"
-        "ldr q17, [x23, #0x10]\n"
-        "fmul v26.4s, v27.4s, v20.s[2]\n"
-        "fmul v20.4s, v27.4s, v20.s[3]\n"
-        "fmla v12.4s, v0.4s, v26.4s\n"
-        "ldr d0, [x22, #-0x8]\n"
-        "ldr d26, [x21, #-0x8]\n"
-        "fcvtl v0.4s, v0.4h\n"
-        "fmla v28.4s, v18.4s, v20.4s\n"
-        "movi v20.4s, #0x0\n"
-        "movi v18.4s, #0x0\n"
-        ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
-        ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
-        "ldr q19, [x23, #0x20]\n"
-        "fcvtl v26.4s, v26.4h\n"
-        ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
-        ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
-        "ldr q19, [x23, #0x40]\n"
-        ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
-        ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
-        "ldr q19, [x23, #0x60]\n"
-        ".inst 0x4e9da674  // smmla v20.4s, v19.16b, v29.16b\n"
-        ".inst 0x4e83a672  // smmla v18.4s, v19.16b, v3.16b\n"
-        "uzp1 v19.2d, v20.2d, v18.2d\n"
-        "scvtf v19.4s, v19.4s, #0x4\n"
-        "uzp2 v20.2d, v20.2d, v18.2d\n"
-        "fmul v18.4s, v27.4s, v9.s[0]\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "fmla v11.4s, v19.4s, v18.4s\n"
-        "ldr q18, [x22, #0x0]\n"
-        "fmul v19.4s, v27.4s, v9.s[1]\n"
-        "fmla v13.4s, v20.4s, v19.4s\n"
-        "movi v19.4s, #0x0\n"
-        "movi v20.4s, #0x0\n"
-        ".inst 0x4e88a633  // smmla v19.4s, v17.16b, v8.16b\n"
-        ".inst 0x4e9fa634  // smmla v20.4s, v17.16b, v31.16b\n"
-        "ldr q17, [x23, #0x30]\n"
-        ".inst 0x4e8fa633  // smmla v19.4s, v17.16b, v15.16b\n"
-        ".inst 0x4e81a634  // smmla v20.4s, v17.16b, v1.16b\n"
-        "ldr q17, [x23, #0x50]\n"
-        ".inst 0x4e95a633  // smmla v19.4s, v17.16b, v21.16b\n"
-        ".inst 0x4e90a634  // smmla v20.4s, v17.16b, v16.16b\n"
-        "ldr q17, [x23, #0x70]\n"
-        "add x23, x23, #0x88\n"
-        ".inst 0x4e9da633  // smmla v19.4s, v17.16b, v29.16b\n"
-        ".inst 0x4e83a634  // smmla v20.4s, v17.16b, v3.16b\n"
-        "uzp1 v17.2d, v19.2d, v20.2d\n"
-        "scvtf v17.4s, v17.4s, #0x4\n"
-        "uzp2 v20.2d, v19.2d, v20.2d\n"
-        "fmul v19.4s, v27.4s, v9.s[2]\n"
-        "fmul v9.4s, v27.4s, v9.s[3]\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "fmla v22.4s, v17.4s, v19.4s\n"
-        "ldr q17, [x22, #0x10]\n"
-        "movi v19.4s, #0x0\n"
-        ".inst 0x4e88a653  // smmla v19.4s, v18.16b, v8.16b\n"
-        "fmla v23.4s, v20.4s, v9.4s\n"
-        "movi v20.4s, #0x0\n"
-        "movi v9.4s, #0x0\n"
-        ".inst 0x4e9fa654  // smmla v20.4s, v18.16b, v31.16b\n"
-        "ldr q18, [x22, #0x20]\n"
-        ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
-        ".inst 0x4e8fa653  // smmla v19.4s, v18.16b, v15.16b\n"
-        ".inst 0x4e81a654  // smmla v20.4s, v18.16b, v1.16b\n"
-        "ldr q18, [x22, #0x40]\n"
-        ".inst 0x4e95a653  // smmla v19.4s, v18.16b, v21.16b\n"
-        ".inst 0x4e90a654  // smmla v20.4s, v18.16b, v16.16b\n"
-        "ldr q18, [x22, #0x60]\n"
-        ".inst 0x4e9da653  // smmla v19.4s, v18.16b, v29.16b\n"
-        ".inst 0x4e83a654  // smmla v20.4s, v18.16b, v3.16b\n"
-        "movi v18.4s, #0x0\n"
-        ".inst 0x4e9fa632  // smmla v18.4s, v17.16b, v31.16b\n"
-        "ldr q17, [x22, #0x30]\n"
-        ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
-        ".inst 0x4e81a632  // smmla v18.4s, v17.16b, v1.16b\n"
-        "ldr q17, [x22, #0x50]\n"
-        ".inst 0x4e95a629  // smmla v9.4s, v17.16b, v21.16b\n"
-        ".inst 0x4e90a632  // smmla v18.4s, v17.16b, v16.16b\n"
-        "ldr q17, [x22, #0x70]\n"
-        "add x22, x22, #0x88\n"
-        ".inst 0x4e9da629  // smmla v9.4s, v17.16b, v29.16b\n"
-        ".inst 0x4e83a632  // smmla v18.4s, v17.16b, v3.16b\n"
-        "uzp1 v17.2d, v19.2d, v20.2d\n"
-        "uzp2 v20.2d, v19.2d, v20.2d\n"
-        "fmul v19.4s, v27.4s, v0.s[0]\n"
-        "scvtf v17.4s, v17.4s, #0x4\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "fmla v25.4s, v17.4s, v19.4s\n"
-        "ldr q19, [x21, #0x0]\n"
-        "fmul v17.4s, v27.4s, v0.s[1]\n"
-        "fmla v5.4s, v20.4s, v17.4s\n"
-        "ldr q17, [x21, #0x10]\n"
-        "uzp1 v20.2d, v9.2d, v18.2d\n"
-        "uzp2 v9.2d, v9.2d, v18.2d\n"
-        "fmul v18.4s, v27.4s, v0.s[2]\n"
-        "fmul v0.4s, v27.4s, v0.s[3]\n"
-        "scvtf v20.4s, v20.4s, #0x4\n"
-        "scvtf v9.4s, v9.4s, #0x4\n"
-        "fmla v7.4s, v20.4s, v18.4s\n"
-        "movi v20.4s, #0x0\n"
-        "movi v18.4s, #0x0\n"
-        ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
-        ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
-        "ldr q19, [x21, #0x20]\n"
-        "fmla v4.4s, v9.4s, v0.4s\n"
-        "movi v9.4s, #0x0\n"
-        "movi v0.4s, #0x0\n"
-        ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
-        "fmul v8.4s, v27.4s, v26.s[0]\n"
-        ".inst 0x4e9fa620  // smmla v0.4s, v17.16b, v31.16b\n"
-        "ldr q17, [x21, #0x30]\n"
-        ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
-        "fmul v31.4s, v27.4s, v26.s[1]\n"
-        ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
-        "ldr q19, [x21, #0x40]\n"
-        ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
-        "fmul v15.4s, v27.4s, v26.s[2]\n"
-        "fmul v27.4s, v27.4s, v26.s[3]\n"
-        ".inst 0x4e81a620  // smmla v0.4s, v17.16b, v1.16b\n"
-        "ldr q1, [x21, #0x50]\n"
-        ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
-        ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
-        "ldr q26, [x21, #0x60]\n"
-        ".inst 0x4e95a429  // smmla v9.4s, v1.16b, v21.16b\n"
-        ".inst 0x4e90a420  // smmla v0.4s, v1.16b, v16.16b\n"
-        "ldr q21, [x21, #0x70]\n"
-        "add x21, x21, #0x88\n"
-        ".inst 0x4e9da754  // smmla v20.4s, v26.16b, v29.16b\n"
-        ".inst 0x4e83a752  // smmla v18.4s, v26.16b, v3.16b\n"
-        ".inst 0x4e9da6a9  // smmla v9.4s, v21.16b, v29.16b\n"
-        ".inst 0x4e83a6a0  // smmla v0.4s, v21.16b, v3.16b\n"
-        "uzp1 v29.2d, v20.2d, v18.2d\n"
-        "uzp2 v21.2d, v20.2d, v18.2d\n"
-        "scvtf v29.4s, v29.4s, #0x4\n"
-        "uzp1 v18.2d, v9.2d, v0.2d\n"
-        "uzp2 v16.2d, v9.2d, v0.2d\n"
-        "scvtf v21.4s, v21.4s, #0x4\n"
-        "fmla v6.4s, v29.4s, v8.4s\n"
-        "scvtf v18.4s, v18.4s, #0x4\n"
-        "scvtf v16.4s, v16.4s, #0x4\n"
-        "fmla v30.4s, v21.4s, v31.4s\n"
-        "fmla v24.4s, v18.4s, v15.4s\n"
-        "fmla v14.4s, v16.4s, v27.4s\n"
-        "bgt 3b\n"
-        "mov x20, %x[res_ptr]\n"
-        "subs x27, x27, #0x4\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "str q2, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q10, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q12, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q28, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q11, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q13, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q22, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q23, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q25, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q5, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q7, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q4, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q6, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q30, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q24, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "str q14, [x20, #0x0]\n"
-        "bne 2b\n"
-        "mov x20, #0x4\n"
-        "sub x10, x10, #0x10\n"
-        "cmp x10, #0x10\n"
-        "mov %x[res_ptr], x26\n"
-        "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
-        "bge 1b\n"
-        "4:"  // Row loop skip
-        "cbz x10, 9f\n"
-        "5:"  // Row tail: Row loop
-        "add x24, %x[b_ptr], #0x8\n"
-        "mov x23, %x[nc]\n"
-        "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
-        "6:"  // Row tail: Column loop
-        "movi v2.16b, #0x0\n"
-        "movi v10.16b, #0x0\n"
-        "add x25, %x[a_ptr], #0x8\n"
-        "mov x21, %x[nb]\n"
-        "movi v12.16b, #0x0\n"
-        "movi v28.16b, #0x0\n"
-        "7:"  // Row tail: Block loop
-        "ldr q6, [x24, #0x0]\n"
-        "ldr q5, [x24, #0x10]\n"
-        "movi v17.16b, #0x4\n"
-        "movi v8.4s, #0x0\n"
-        "ldr q4, [x25, #0x0]\n"
-        "ldr q13, [x25, #0x10]\n"
-        "movi v27.4s, #0x0\n"
-        "movi v0.4s, #0x0\n"
-        "ldr q31, [x24, #0x20]\n"
-        "ldr q14, [x24, #0x30]\n"
-        "movi v29.4s, #0x0\n"
-        "movi v22.16b, #0xf0\n"
-        "ldr q11, [x25, #0x20]\n"
-        "ldr q23, [x25, #0x30]\n"
-        "sshl v21.16b, v6.16b, v17.16b\n"
-        "sshl v16.16b, v5.16b, v17.16b\n"
-        "ldr q20, [x25, #0x40]\n"
-        "ldr q26, [x25, #0x50]\n"
-        "and v6.16b, v6.16b, v22.16b\n"
-        "and v5.16b, v5.16b, v22.16b\n"
-        "ldr q25, [x25, #0x60]\n"
-        "ldr q3, [x25, #0x70]\n"
-        "sshl v19.16b, v31.16b, v17.16b\n"
-        "sshl v18.16b, v14.16b, v17.16b\n"
-        "ldr d17, [x25, #-0x8]\n"
-        ".inst 0x4e95a488  // smmla v8.4s, v4.16b, v21.16b\n"
-        ".inst 0x4e90a49b  // smmla v27.4s, v4.16b, v16.16b\n"
-        "and v31.16b, v31.16b, v22.16b\n"
-        ".inst 0x4e95a5a0  // smmla v0.4s, v13.16b, v21.16b\n"
-        ".inst 0x4e90a5bd  // smmla v29.4s, v13.16b, v16.16b\n"
-        "and v14.16b, v14.16b, v22.16b\n"
-        "sub x20, x24, #0x8\n"
-        "ldr d16, [x20, #0x0]\n"
-        "subs x21, x21, #0x1\n"
-        "add x25, x25, #0x88\n"
-        "fcvtl v17.4s, v17.4h\n"
-        "add x24, x24, #0x48\n"
-        ".inst 0x4e93a568  // smmla v8.4s, v11.16b, v19.16b\n"
-        ".inst 0x4e92a57b  // smmla v27.4s, v11.16b, v18.16b\n"
-        ".inst 0x4e93a6e0  // smmla v0.4s, v23.16b, v19.16b\n"
-        ".inst 0x4e92a6fd  // smmla v29.4s, v23.16b, v18.16b\n"
-        "fcvtl v16.4s, v16.4h\n"
-        ".inst 0x4e86a688  // smmla v8.4s, v20.16b, v6.16b\n"
-        ".inst 0x4e85a69b  // smmla v27.4s, v20.16b, v5.16b\n"
-        "fmul v23.4s, v16.4s, v17.s[0]\n"
-        "fmul v21.4s, v16.4s, v17.s[1]\n"
-        "fmul v1.4s, v16.4s, v17.s[2]\n"
-        "fmul v20.4s, v16.4s, v17.s[3]\n"
-        ".inst 0x4e86a740  // smmla v0.4s, v26.16b, v6.16b\n"
-        ".inst 0x4e85a75d  // smmla v29.4s, v26.16b, v5.16b\n"
-        ".inst 0x4e9fa728  // smmla v8.4s, v25.16b, v31.16b\n"
-        ".inst 0x4e8ea73b  // smmla v27.4s, v25.16b, v14.16b\n"
-        ".inst 0x4e9fa460  // smmla v0.4s, v3.16b, v31.16b\n"
-        ".inst 0x4e8ea47d  // smmla v29.4s, v3.16b, v14.16b\n"
-        "uzp1 v19.2d, v8.2d, v27.2d\n"
-        "uzp2 v18.2d, v8.2d, v27.2d\n"
-        "scvtf v19.4s, v19.4s, #0x4\n"
-        "uzp1 v17.2d, v0.2d, v29.2d\n"
-        "uzp2 v16.2d, v0.2d, v29.2d\n"
-        "scvtf v18.4s, v18.4s, #0x4\n"
-        "fmla v2.4s, v19.4s, v23.4s\n"
-        "scvtf v17.4s, v17.4s, #0x4\n"
-        "scvtf v16.4s, v16.4s, #0x4\n"
-        "fmla v10.4s, v18.4s, v21.4s\n"
-        "fmla v12.4s, v17.4s, v1.4s\n"
-        "fmla v28.4s, v16.4s, v20.4s\n"
-        "bgt 7b\n"
-        "mov x20, %x[res_ptr]\n"
-        "cmp x10, #0x1\n"
-        "str q2, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "cmp x10, #0x2\n"
-        "str q10, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "cmp x10, #0x3\n"
-        "str q12, [x20, #0x0]\n"
-        "add x20, x20, %x[res_stride]\n"
-        "ble 8f\n"
-        "str q28, [x20, #0x0]\n"
-        "8:"  // Row tail: Accumulator store skip
-        "subs x23, x23, #0x4\n"
-        "add %x[res_ptr], %x[res_ptr], #0x10\n"
-        "bne 6b\n"
-        "subs x10, x10, #0x4\n"
-        "add %x[a_ptr], %x[a_ptr], x9\n"
-        "mov %x[res_ptr], x22\n"
-        "bgt 5b\n"
-        "9:"  // Row tail: Row loop skip
-        : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
-        : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
-    );
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
-#else
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v22.16b, #0x0\n"
+            "movi v23.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v6.16b, #0x0\n"
+            "movi v30.16b, #0x0\n"
+            "movi v24.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q21, [x28, #0x0]\n"
+            "ldr q16, [x28, #0x10]\n"
+            "movi v1.16b, #0x4\n"
+            "movi v19.4s, #0x0\n"
+            "ldr q27, [x25, #0x0]\n"
+            "ldr q15, [x25, #0x10]\n"
+            "movi v26.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            "ldr q29, [x28, #0x20]\n"
+            "ldr q3, [x28, #0x30]\n"
+            "movi v17.4s, #0x0\n"
+            "movi v0.16b, #0xf0\n"
+            "ldr d20, [x25, #-0x8]\n"
+            "ldr d9, [x23, #-0x8]\n"
+            "sshl v8.16b, v21.16b, v1.16b\n"
+            "sshl v31.16b, v16.16b, v1.16b\n"
+            "and v21.16b, v21.16b, v0.16b\n"
+            "and v16.16b, v16.16b, v0.16b\n"
+            "sub x20, x28, #0x8\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4e88a773  // smmla v19.4s, v27.16b, v8.16b\n"
+            ".inst 0x4e9fa77a  // smmla v26.4s, v27.16b, v31.16b\n"
+            "ldr q27, [x25, #0x20]\n"
+            ".inst 0x4e88a5f2  // smmla v18.4s, v15.16b, v8.16b\n"
+            ".inst 0x4e9fa5f1  // smmla v17.4s, v15.16b, v31.16b\n"
+            "sshl v15.16b, v29.16b, v1.16b\n"
+            "sshl v1.16b, v3.16b, v1.16b\n"
+            "and v29.16b, v29.16b, v0.16b\n"
+            "and v3.16b, v3.16b, v0.16b\n"
+            "ldr q0, [x25, #0x30]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            ".inst 0x4e8fa773  // smmla v19.4s, v27.16b, v15.16b\n"
+            "fcvtl v9.4s, v9.4h\n"
+            ".inst 0x4e81a77a  // smmla v26.4s, v27.16b, v1.16b\n"
+            "ldr q27, [x25, #0x40]\n"
+            ".inst 0x4e8fa412  // smmla v18.4s, v0.16b, v15.16b\n"
+            ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
+            "ldr q0, [x25, #0x50]\n"
+            ".inst 0x4e95a773  // smmla v19.4s, v27.16b, v21.16b\n"
+            ".inst 0x4e90a77a  // smmla v26.4s, v27.16b, v16.16b\n"
+            "ldr q27, [x25, #0x60]\n"
+            ".inst 0x4e95a412  // smmla v18.4s, v0.16b, v21.16b\n"
+            ".inst 0x4e90a411  // smmla v17.4s, v0.16b, v16.16b\n"
+            "ldr q0, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4e9da773  // smmla v19.4s, v27.16b, v29.16b\n"
+            ".inst 0x4e83a77a  // smmla v26.4s, v27.16b, v3.16b\n"
+            "ldr d27, [x20, #0x0]\n"
+            ".inst 0x4e9da412  // smmla v18.4s, v0.16b, v29.16b\n"
+            ".inst 0x4e83a411  // smmla v17.4s, v0.16b, v3.16b\n"
+            "fcvtl v27.4s, v27.4h\n"
+            "uzp1 v0.2d, v19.2d, v26.2d\n"
+            "uzp2 v26.2d, v19.2d, v26.2d\n"
+            "fmul v19.4s, v27.4s, v20.s[0]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "fmla v2.4s, v0.4s, v19.4s\n"
+            "ldr q19, [x23, #0x0]\n"
+            "uzp1 v0.2d, v18.2d, v17.2d\n"
+            "uzp2 v18.2d, v18.2d, v17.2d\n"
+            "fmul v17.4s, v27.4s, v20.s[1]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v10.4s, v26.4s, v17.4s\n"
+            "ldr q17, [x23, #0x10]\n"
+            "fmul v26.4s, v27.4s, v20.s[2]\n"
+            "fmul v20.4s, v27.4s, v20.s[3]\n"
+            "fmla v12.4s, v0.4s, v26.4s\n"
+            "ldr d0, [x22, #-0x8]\n"
+            "ldr d26, [x21, #-0x8]\n"
+            "fcvtl v0.4s, v0.4h\n"
+            "fmla v28.4s, v18.4s, v20.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x23, #0x20]\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x23, #0x40]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q19, [x23, #0x60]\n"
+            ".inst 0x4e9da674  // smmla v20.4s, v19.16b, v29.16b\n"
+            ".inst 0x4e83a672  // smmla v18.4s, v19.16b, v3.16b\n"
+            "uzp1 v19.2d, v20.2d, v18.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp2 v20.2d, v20.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v9.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v11.4s, v19.4s, v18.4s\n"
+            "ldr q18, [x22, #0x0]\n"
+            "fmul v19.4s, v27.4s, v9.s[1]\n"
+            "fmla v13.4s, v20.4s, v19.4s\n"
+            "movi v19.4s, #0x0\n"
+            "movi v20.4s, #0x0\n"
+            ".inst 0x4e88a633  // smmla v19.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e9fa634  // smmla v20.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x23, #0x30]\n"
+            ".inst 0x4e8fa633  // smmla v19.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a634  // smmla v20.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x23, #0x50]\n"
+            ".inst 0x4e95a633  // smmla v19.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a634  // smmla v20.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4e9da633  // smmla v19.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a634  // smmla v20.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v9.s[2]\n"
+            "fmul v9.4s, v27.4s, v9.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v22.4s, v17.4s, v19.4s\n"
+            "ldr q17, [x22, #0x10]\n"
+            "movi v19.4s, #0x0\n"
+            ".inst 0x4e88a653  // smmla v19.4s, v18.16b, v8.16b\n"
+            "fmla v23.4s, v20.4s, v9.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            ".inst 0x4e9fa654  // smmla v20.4s, v18.16b, v31.16b\n"
+            "ldr q18, [x22, #0x20]\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e8fa653  // smmla v19.4s, v18.16b, v15.16b\n"
+            ".inst 0x4e81a654  // smmla v20.4s, v18.16b, v1.16b\n"
+            "ldr q18, [x22, #0x40]\n"
+            ".inst 0x4e95a653  // smmla v19.4s, v18.16b, v21.16b\n"
+            ".inst 0x4e90a654  // smmla v20.4s, v18.16b, v16.16b\n"
+            "ldr q18, [x22, #0x60]\n"
+            ".inst 0x4e9da653  // smmla v19.4s, v18.16b, v29.16b\n"
+            ".inst 0x4e83a654  // smmla v20.4s, v18.16b, v3.16b\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e9fa632  // smmla v18.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x22, #0x30]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a632  // smmla v18.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x22, #0x50]\n"
+            ".inst 0x4e95a629  // smmla v9.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a632  // smmla v18.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4e9da629  // smmla v9.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a632  // smmla v18.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v0.s[0]\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v25.4s, v17.4s, v19.4s\n"
+            "ldr q19, [x21, #0x0]\n"
+            "fmul v17.4s, v27.4s, v0.s[1]\n"
+            "fmla v5.4s, v20.4s, v17.4s\n"
+            "ldr q17, [x21, #0x10]\n"
+            "uzp1 v20.2d, v9.2d, v18.2d\n"
+            "uzp2 v9.2d, v9.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v0.s[2]\n"
+            "fmul v0.4s, v27.4s, v0.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "fmla v7.4s, v20.4s, v18.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x21, #0x20]\n"
+            "fmla v4.4s, v9.4s, v0.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            "fmul v8.4s, v27.4s, v26.s[0]\n"
+            ".inst 0x4e9fa620  // smmla v0.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x21, #0x30]\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            "fmul v31.4s, v27.4s, v26.s[1]\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x21, #0x40]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            "fmul v15.4s, v27.4s, v26.s[2]\n"
+            "fmul v27.4s, v27.4s, v26.s[3]\n"
+            ".inst 0x4e81a620  // smmla v0.4s, v17.16b, v1.16b\n"
+            "ldr q1, [x21, #0x50]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q26, [x21, #0x60]\n"
+            ".inst 0x4e95a429  // smmla v9.4s, v1.16b, v21.16b\n"
+            ".inst 0x4e90a420  // smmla v0.4s, v1.16b, v16.16b\n"
+            "ldr q21, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4e9da754  // smmla v20.4s, v26.16b, v29.16b\n"
+            ".inst 0x4e83a752  // smmla v18.4s, v26.16b, v3.16b\n"
+            ".inst 0x4e9da6a9  // smmla v9.4s, v21.16b, v29.16b\n"
+            ".inst 0x4e83a6a0  // smmla v0.4s, v21.16b, v3.16b\n"
+            "uzp1 v29.2d, v20.2d, v18.2d\n"
+            "uzp2 v21.2d, v20.2d, v18.2d\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "uzp1 v18.2d, v9.2d, v0.2d\n"
+            "uzp2 v16.2d, v9.2d, v0.2d\n"
+            "scvtf v21.4s, v21.4s, #0x4\n"
+            "fmla v6.4s, v29.4s, v8.4s\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v30.4s, v21.4s, v31.4s\n"
+            "fmla v24.4s, v18.4s, v15.4s\n"
+            "fmla v14.4s, v16.4s, v27.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q28, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q22, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q6, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q30, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q24, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q6, [x24, #0x0]\n"
+            "ldr q5, [x24, #0x10]\n"
+            "movi v17.16b, #0x4\n"
+            "movi v8.4s, #0x0\n"
+            "ldr q4, [x25, #0x0]\n"
+            "ldr q13, [x25, #0x10]\n"
+            "movi v27.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q31, [x24, #0x20]\n"
+            "ldr q14, [x24, #0x30]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v22.16b, #0xf0\n"
+            "ldr q11, [x25, #0x20]\n"
+            "ldr q23, [x25, #0x30]\n"
+            "sshl v21.16b, v6.16b, v17.16b\n"
+            "sshl v16.16b, v5.16b, v17.16b\n"
+            "ldr q20, [x25, #0x40]\n"
+            "ldr q26, [x25, #0x50]\n"
+            "and v6.16b, v6.16b, v22.16b\n"
+            "and v5.16b, v5.16b, v22.16b\n"
+            "ldr q25, [x25, #0x60]\n"
+            "ldr q3, [x25, #0x70]\n"
+            "sshl v19.16b, v31.16b, v17.16b\n"
+            "sshl v18.16b, v14.16b, v17.16b\n"
+            "ldr d17, [x25, #-0x8]\n"
+            ".inst 0x4e95a488  // smmla v8.4s, v4.16b, v21.16b\n"
+            ".inst 0x4e90a49b  // smmla v27.4s, v4.16b, v16.16b\n"
+            "and v31.16b, v31.16b, v22.16b\n"
+            ".inst 0x4e95a5a0  // smmla v0.4s, v13.16b, v21.16b\n"
+            ".inst 0x4e90a5bd  // smmla v29.4s, v13.16b, v16.16b\n"
+            "and v14.16b, v14.16b, v22.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr d16, [x20, #0x0]\n"
+            "subs x21, x21, #0x1\n"
+            "add x25, x25, #0x88\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "add x24, x24, #0x48\n"
+            ".inst 0x4e93a568  // smmla v8.4s, v11.16b, v19.16b\n"
+            ".inst 0x4e92a57b  // smmla v27.4s, v11.16b, v18.16b\n"
+            ".inst 0x4e93a6e0  // smmla v0.4s, v23.16b, v19.16b\n"
+            ".inst 0x4e92a6fd  // smmla v29.4s, v23.16b, v18.16b\n"
+            "fcvtl v16.4s, v16.4h\n"
+            ".inst 0x4e86a688  // smmla v8.4s, v20.16b, v6.16b\n"
+            ".inst 0x4e85a69b  // smmla v27.4s, v20.16b, v5.16b\n"
+            "fmul v23.4s, v16.4s, v17.s[0]\n"
+            "fmul v21.4s, v16.4s, v17.s[1]\n"
+            "fmul v1.4s, v16.4s, v17.s[2]\n"
+            "fmul v20.4s, v16.4s, v17.s[3]\n"
+            ".inst 0x4e86a740  // smmla v0.4s, v26.16b, v6.16b\n"
+            ".inst 0x4e85a75d  // smmla v29.4s, v26.16b, v5.16b\n"
+            ".inst 0x4e9fa728  // smmla v8.4s, v25.16b, v31.16b\n"
+            ".inst 0x4e8ea73b  // smmla v27.4s, v25.16b, v14.16b\n"
+            ".inst 0x4e9fa460  // smmla v0.4s, v3.16b, v31.16b\n"
+            ".inst 0x4e8ea47d  // smmla v29.4s, v3.16b, v14.16b\n"
+            "uzp1 v19.2d, v8.2d, v27.2d\n"
+            "uzp2 v18.2d, v8.2d, v27.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp1 v17.2d, v0.2d, v29.2d\n"
+            "uzp2 v16.2d, v0.2d, v29.2d\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v2.4s, v19.4s, v23.4s\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v10.4s, v18.4s, v21.4s\n"
+            "fmla v12.4s, v17.4s, v1.4s\n"
+            "fmla v28.4s, v16.4s, v20.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q28, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     float sumf[4][4];
     int sumi;
 
@@ -2012,7 +1983,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                                 const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                                        (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                             }
                             sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
                         }
@@ -2025,7 +1996,6 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
             }
         }
     }
-#endif
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -2048,8 +2018,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2459,154 +2430,329 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         );
         return;
     }
-    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
-                    "performance");
-    }
-    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
-                    "quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(ggml_cpu_has_sve() &&
-                "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
+#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
 #elif defined(__AVX2__) || defined(__AVX512F__)
-    const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
-    const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
-    int64_t b_nb = n / QK4_0;
-    int64_t y = 0;
-    // Mask to mask out nibbles from packed bytes
-    const __m256i m4b = _mm256_set1_epi8(0x0F);
-    const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
-    // Lookup table to convert signed nibbles to signed bytes
-    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
-    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
-    // Permute mask used for easier vector processing at later stages
-    __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
-    int64_t xstart = 0;
-    int anr = nr - nr%16; // Used to align nr with boundary of 16
-#ifdef __AVX512F__
-    int anc = nc - nc%16; // Used to align nc with boundary of 16
-    // Mask to mask out nibbles from packed bytes expanded to 512 bit length
-    const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
-    // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
-    __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
+    {
+        const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+        const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
+        int64_t b_nb = n / QK4_0;
+        int64_t y = 0;
+        // Mask to mask out nibbles from packed bytes
+        const __m256i m4b = _mm256_set1_epi8(0x0F);
+        const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
+        // Lookup table to convert signed nibbles to signed bytes
+        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+        signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+        // Permute mask used for easier vector processing at later stages
+        __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
+        int64_t xstart = 0;
+        int anr = nr - nr%16; // Used to align nr with boundary of 16
+    #ifdef __AVX512F__
+        int anc = nc - nc%16; // Used to align nc with boundary of 16
+        // Mask to mask out nibbles from packed bytes expanded to 512 bit length
+        const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
+        // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
+        __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
 
-    // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
-    for (; y < anr / 4; y += 4) {
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < anr / 4; y += 4) {
 
-        const block_q8_0x4 * a_ptrs[4];
+            const block_q8_0x4 * a_ptrs[4];
 
-        a_ptrs[0] = a_ptr_start + (y * nb);
-        for (int i = 0; i < 3; ++i) {
-            a_ptrs[i + 1] = a_ptrs[i] + nb;
-        }
-
-        // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
-        for (int64_t x = 0; x < anc / 8; x += 2) {
-
-            const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
-            const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
-
-            // Master FP accumulators
-            __m512 acc_rows[16];
-            for (int i = 0; i < 16; i++) {
-                acc_rows[i] = _mm512_setzero_ps();
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
             }
 
-            for (int64_t b = 0; b < nb; b++) {
-                // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
-                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
-                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
-                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
-                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
 
-                const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
-                const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
-                const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
-                const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
 
-                // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
-                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+                // Master FP accumulators
+                __m512 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
 
-                const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
-                const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
 
-                const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
-                const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
-                const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
-                const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-                // 4-bit -> 8-bit - Sign is maintained
-                const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
-                const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+                    // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
 
-                const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
-                const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
 
-                const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
-                const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
 
-                const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
-                const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
 
-                // Shuffle pattern one - right side input
-                const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
-                const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
 
-                const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
-                const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
 
-                const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
-                const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
 
-                const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
-                const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
 
-                // Shuffle pattern two - right side input
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
 
-                const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
-                const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
 
-                const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
-                const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
 
-                const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
-                const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+                    // Shuffle pattern two - right side input
 
-                const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
-                const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
 
-                // Scale values - Load the weight scale values of two block_q4_0x8
-                const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
 
-                // Process LHS in pairs of rows
-                for (int rp = 0; rp < 4; rp++) {
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                    // Process LHS in pairs of rows
+                    for (int rp = 0; rp < 4; rp++) {
+
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                        __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                        __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                        __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                        __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                        __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                        __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                        __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                        __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                        __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                        __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                        __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                        __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                        __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                        __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                        __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                        __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                        // Shuffle pattern one - left side input
+
+                        const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+
+                        const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m512i iacc_mat_00_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_01_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_10_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_11_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_00_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_01_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                        __m512i iacc_mat_10_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_11_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                        // Straighten out to make 4 row vectors
+                        __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+                        __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+                        __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+                        __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+                        const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
+
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
+
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+                // Master FP accumulators
+                __m512 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
 
                     // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
                     // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
-                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
                     __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
                     __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
-                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
                     __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
                     __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
-                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
                     __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
                     __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
-                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
                     __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
                     __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
 
@@ -2680,314 +2826,286 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
 
                     // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
                     const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
 
                     // Multiply with appropiate scales and accumulate
-                    acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
-                    acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
-                    acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
-                    acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
+                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
+                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
                 }
             }
+        }
+        if (anc != nc) {
+            xstart = anc/8;
+            y = 0;
+        }
+    #endif // __AVX512F__
 
-            // Store the accumulated values
-            for (int i = 0; i < 16; i++) {
-                _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+
+        for (; y < anr / 4; y += 4) {
+            const block_q8_0x4 * a_ptrs[4];
+
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
+            }
+
+            // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = xstart; x < nc / 8; x++) {
+
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+                // Master FP accumulators
+                __m256 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
+
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
+
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
+
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
+
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
+
+                    // Process LHS in groups of four
+                    for (int rp = 0; rp < 4; rp++) {
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
+                        __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
+                        __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
+                        __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
+                        __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
+                        __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
+                        __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
+                        __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
+                        __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
+
+                        // Shuffle pattern one - left side input
+                        const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+                        const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m256i iacc_mat_00_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_01_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_10_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_11_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_00_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_01_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                        __m256i iacc_mat_10_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_11_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+                        // Straighten out to make 4 row vectors
+                        __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
+                        __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
+                        __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
+                        __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32,  255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
             }
         }
-    }
-    // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
-    for (; y < nr / 4; y ++) {
 
-        const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
 
-        // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
-        for (int64_t x = 0; x < anc / 8; x += 2) {
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
 
-            const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
-            const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+            // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+            for (int64_t x = xstart; x < nc / 8; x++) {
 
-            // Master FP accumulators
-            __m512 acc_rows[4];
-            for (int i = 0; i < 4; i++) {
-                acc_rows[i] = _mm512_setzero_ps();
-            }
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
 
-            for (int64_t b = 0; b < nb; b++) {
-                // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
-                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
-                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
-                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
-                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+                // Master FP accumulators
+                __m256 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
 
-                const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
-                const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
-                const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
-                const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-                // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
-                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
 
-                const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
-                const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b));  //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b));  //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
 
-                const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
-                const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
-                const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
-                const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b));  //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b));  //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
 
-                // 4-bit -> 8-bit - Sign is maintained
-                const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
-                const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b));  //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b));  //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
 
-                const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
-                const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b));  //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b));  //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
 
-                const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
-                const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
 
-                const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
-                const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
 
-                // Shuffle pattern one - right side input
-                const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
-                const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
 
-                const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
-                const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
 
-                const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
-                const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+                    // Shuffle pattern two - right side input
 
-                const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
-                const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
 
-                // Shuffle pattern two - right side input
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
 
-                const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
-                const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
 
-                const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
-                const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
 
-                const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
-                const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
 
-                const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
-                const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
-
-
-                // Scale values - Load the weight scale values of two block_q4_0x8
-                const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
-
-                // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
-                __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
-                __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
-                __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
-                __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
-                __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
-                __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
-                __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
-                __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
-                __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
-                __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
-                __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
-                __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
-
-                __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
-                __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
-                __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
-                __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
-                __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
-                __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
-                __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
-                __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
-
-                // Shuffle pattern one - left side input
-
-                const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                // Shuffle pattern two - left side input
-
-                const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                // Resembles MMLAs into 2x2 matrices in ARM Version
-                __m512i iacc_mat_00_sp1 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
-                __m512i iacc_mat_01_sp1 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                __m512i iacc_mat_10_sp1 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
-                __m512i iacc_mat_11_sp1 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                __m512i iacc_mat_00_sp2 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
-                __m512i iacc_mat_01_sp2 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
-                __m512i iacc_mat_10_sp2 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
-                __m512i iacc_mat_11_sp2 =
-                    _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
-
-                // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
-                // Straighten out to make 4 row vectors
-                __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
-                __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
-                __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
-                __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
-
-                // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
-                const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
-
-                // Multiply with appropiate scales and accumulate
-                acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
-                acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
-                acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
-                acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
-            }
-
-            // Store the accumulated values
-            for (int i = 0; i < 4; i++) {
-                _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-            }
-        }
-    }
-    if (anc != nc) {
-        xstart = anc/8;
-        y = 0;
-    }
-#endif // __AVX512F__
-
-    // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
-
-    for (; y < anr / 4; y += 4) {
-        const block_q8_0x4 * a_ptrs[4];
-
-        a_ptrs[0] = a_ptr_start + (y * nb);
-        for (int i = 0; i < 3; ++i) {
-            a_ptrs[i + 1] = a_ptrs[i] + nb;
-        }
-
-        // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
-        for (int64_t x = xstart; x < nc / 8; x++) {
-
-            const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
-            // Master FP accumulators
-            __m256 acc_rows[16];
-            for (int i = 0; i < 16; i++) {
-                acc_rows[i] = _mm256_setzero_ps();
-            }
-
-            for (int64_t b = 0; b < nb; b++) {
-                // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
-                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
-                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
-                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
-
-                // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
-                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                // 4-bit -> 8-bit - Sign is maintained
-                const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
-                const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
-
-                const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
-                const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
-
-                const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
-                const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
-
-                const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
-                const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
-
-                // Shuffle pattern one - right side input
-                const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
-                const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
-
-                const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
-                const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
-
-                const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
-                const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
-
-                const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
-                const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
-
-                // Shuffle pattern two - right side input
-
-                const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
-                const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
-
-                const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
-                const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
-
-                const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
-                const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
-
-                const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
-                const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
-
-                // Scale values - Load the wight scale values of block_q4_0x8
-                const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
-
-                // Process LHS in groups of four
-                for (int rp = 0; rp < 4; rp++) {
                     // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
                     // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
-                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
                     __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
                     __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
-                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
                     __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
                     __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
-                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
                     __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
                     __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
-                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
                     __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
                     __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
 
                     // Shuffle pattern one - left side input
+
                     const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
                     const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
 
@@ -3001,6 +3119,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
 
                     // Shuffle pattern two - left side input
+
                     const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
                     const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
 
@@ -3038,6 +3157,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
                     __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
 
+
                     // Straighten out to make 4 row vectors
                     __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
                     __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
@@ -3045,187 +3165,24 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
 
                     // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
+                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
 
                     // Multiply with appropiate scales and accumulate
-                    acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
-                    acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
-                    acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
-                    acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32,  255)), acc_rows[rp * 4 + 3]);
+                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
+                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
+                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
                 }
             }
-
-            // Store the accumulated values
-            for (int i = 0; i < 16; i++) {
-                _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-            }
         }
+        return;
     }
-
-    // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
-    for (; y < nr / 4; y ++) {
-
-        const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
-
-        // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-        for (int64_t x = xstart; x < nc / 8; x++) {
-
-            const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
-            // Master FP accumulators
-            __m256 acc_rows[4];
-            for (int i = 0; i < 4; i++) {
-                acc_rows[i] = _mm256_setzero_ps();
-            }
-
-            for (int64_t b = 0; b < nb; b++) {
-                // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
-                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
-                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
-                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
-
-                // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
-                const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                // 4-bit -> 8-bit - Sign is maintained
-                const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b));  //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
-                const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b));  //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
-
-                const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b));  //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
-                const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b));  //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
-
-                const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b));  //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
-                const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b));  //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
-
-                const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b));  //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
-                const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b));  //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
-
-                // Shuffle pattern one - right side input
-                const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
-                const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
-
-                const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
-                const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
-
-                const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
-                const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
-
-                const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
-                const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
-
-                // Shuffle pattern two - right side input
-
-                const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
-                const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
-
-                const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
-                const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
-
-                const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
-                const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
-
-                const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
-                const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
-
-                // Scale values - Load the wight scale values of block_q4_0x8
-                const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
-
-                // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
-                __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
-                __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
-                __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
-                __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
-                __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
-                __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
-                __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
-                __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
-                __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
-                __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
-                __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
-                __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
-
-                // Shuffle pattern one - left side input
-
-                const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                // Shuffle pattern two - left side input
-
-                const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                // Resembles MMLAs into 2x2 matrices in ARM Version
-                __m256i iacc_mat_00_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
-                __m256i iacc_mat_01_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
-                __m256i iacc_mat_10_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
-                __m256i iacc_mat_11_sp1 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
-                __m256i iacc_mat_00_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
-                __m256i iacc_mat_01_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
-                __m256i iacc_mat_10_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
-                __m256i iacc_mat_11_sp2 =
-                    _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
-
-                // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
-                // Straighten out to make 4 row vectors
-                __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
-                __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
-                __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
-                __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
-
-                // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
-
-                // Multiply with appropiate scales and accumulate
-                acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
-                acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
-                acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
-                acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
-            }
-
-            // Store the accumulated values
-            for (int i = 0; i < 4; i++) {
-                _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-            }
-        }
-    }
-#else
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
     float sumf[4][8];
     int sumi;
 
@@ -3258,5 +3215,4 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
             }
         }
     }
-#endif
 }

From ea9c32be71b91b42ecc538bd902e93cbb5fb36cb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 25 Sep 2024 17:26:01 +0200
Subject: [PATCH 43/78] ci : fix docker build number and tag name (#9638)

* ci : fix docker build number and tag name

* fine-grant permissions
---
 .dockerignore                |  2 +-
 .github/workflows/docker.yml | 58 +++++++++++++++++++++++-------------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 8916e2a66..064b7c7be 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 9044cd78b..a4ac9b217 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,11 +15,17 @@ on:
     branches:
       - master
     paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+  workflow_dispatch: # allows manual triggering, useful for debugging
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
+
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
@@ -46,6 +52,8 @@ jobs:
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v2
@@ -60,6 +68,34 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+
+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+          fi
+
+          # list all tags possible
+          TAGS=""
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
+
+          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
+          echo "output_tags=$TAGS"  # print out for debugging
+        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
       # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
@@ -77,25 +113,6 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
       - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
         uses: docker/build-push-action@v6
@@ -103,5 +120,6 @@ jobs:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.output_tags }}
           file: ${{ matrix.config.dockerfile }}

From 7691654c68aa5316108f881136c59f815ccb6809 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Thu, 26 Sep 2024 09:27:40 +0800
Subject: [PATCH 44/78] mtgpu: enable VMM (#9597)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 ggml/src/ggml-cuda.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 0bb7f2d99..6efdab14c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -187,7 +187,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -199,7 +199,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -335,7 +335,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -429,14 +429,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 

From 95bc82fbc0df6d48cf66c857a4dda3d044f45ca2 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Thu, 26 Sep 2024 17:38:31 +0800
Subject: [PATCH 45/78] [SYCL] add missed dll file in package (#9577)

* update oneapi to 2024.2

* use 2024.1

---------

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 .github/workflows/build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a54c5de99..e6a977b60 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -956,6 +956,7 @@ jobs:
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
           echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
 

From 44f59b4301c51f071daa2e951301bb17c14acc9b Mon Sep 17 00:00:00 2001
From: Borislav Stanimirov <b.stanimirov@abv.bg>
Date: Fri, 27 Sep 2024 10:42:06 +0300
Subject: [PATCH 46/78] cmake : add option for common library (#9661)

---
 CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 973907819..415743c2a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
 option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
@@ -191,15 +194,17 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION lib/pkgconfig)
 
 #
-# programs, examples and tests
+# utils, programs, examples and tests
 #
 
-add_subdirectory(common)
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
 
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
-endif ()
+endif()
 
 if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)

From b5de3b74a595cbfefab7eeb5a567425c6a9690cf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 27 Sep 2024 20:57:51 +0300
Subject: [PATCH 47/78] readme : update hot topics

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ce954f713..93225b63c 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
 

From 89f9944981010d195e411a9fbfbb19959412f710 Mon Sep 17 00:00:00 2001
From: Markus Tavenrath <mtavenrath@users.noreply.github.com>
Date: Sat, 28 Sep 2024 12:05:05 +0200
Subject: [PATCH 48/78] Enable use to the rebar feature to upload buffers to
 the device. (#9251)

---
 ggml/src/ggml-vulkan.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index f9da45881..a877145e8 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1079,7 +1079,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
             // Fall back to host memory type
             buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            // use rebar if available, otherwise fallback to device only visible memory
+            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
@@ -2806,7 +2807,11 @@ static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t
 
 static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
-    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+
+    // If the device is not an UMA device the memory is host-accessible through rebar. While writing
+    // through PCIe is sufficient fast reading back data from PCIe is slower than going through
+    // the HW device to host copy path.
+    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
         GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
         memcpy(dst, (uint8_t *) src->ptr + offset, size);

From 6a0f7794847244fb3b99a983e03137d7e832b585 Mon Sep 17 00:00:00 2001
From: Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
Date: Sat, 28 Sep 2024 14:06:16 +0200
Subject: [PATCH 49/78] ggml : add run-time detection of neon, i8mm and sve
 (#9331)

* ggml: Added run-time detection of neon, i8mm and sve

Adds run-time detection of the Arm instructions set features
neon, i8mm and sve for Linux and Apple build targets.

* ggml: Extend feature detection to include non aarch64 Arm arch

* ggml: Move definition of ggml_arm_arch_features to the global data section
---
 ggml/include/ggml.h     |   3 ++
 ggml/src/ggml-aarch64.c |  13 +-----
 ggml/src/ggml-quants.c  |   4 +-
 ggml/src/ggml-quants.h  |   4 --
 ggml/src/ggml.c         | 101 ++++++++++++++++++++++++++++++++++------
 5 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e24b8a319..9f96e0c48 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2507,6 +2507,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_cann       (void);
     GGML_API int ggml_cpu_has_llamafile  (void);
 
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 8912de63d..b27f41147 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
-// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
-static int sve_lane_count(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return ggml_sve_cnt_b;
-#else
-    return 0;
-#endif
-}
-
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 8bffce860..7aa6dce89 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     // VLA Implementation using switch case
     switch (vector_length) {
@@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     //VLA Implemenation for SVE
     switch (vector_length) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index e96ce2b5e..df9c4b24a 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
-#if defined(__ARM_FEATURE_SVE)
-extern int ggml_sve_cnt_b;
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4b782b0c1..fac4466e3 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -39,9 +39,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
-#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -455,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_arm_arch_features = {-1, -1, -1, 0};
+#endif
+
 GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
@@ -3673,6 +3679,66 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__ARM_ARCH)
+
+#if defined(__linux__) && defined(__aarch64__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_arm_arch_features(void) {
+#if defined(__linux__) && defined(__aarch64__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_i8mm = oldp;
+
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_arm_arch_features.has_neon = 1;
+#else
+    ggml_arm_arch_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_arm_arch_features.has_i8mm = 1;
+#else
+    ggml_arm_arch_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.has_sve = 1;
+    ggml_arm_arch_features.sve_cnt = 16;
+#else
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
@@ -3723,6 +3789,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
+#if defined(__ARM_ARCH)
+        ggml_init_arm_arch_features();
+#endif
+
         is_first_call = false;
     }
 
@@ -3771,12 +3841,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (!ggml_sve_cnt_b) {
-        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-    }
-#endif
-
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
@@ -23578,16 +23642,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_sve;
 #else
     return 0;
 #endif
@@ -23734,11 +23798,18 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
 ////////////////////////////////////////////////////////////////////////////////

From 43bcdd9703ec19af7d2a519640b5ed6f4aac3d53 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Sat, 28 Sep 2024 15:07:14 +0300
Subject: [PATCH 50/78] readme : add tool (#9655)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 93225b63c..a452a6d78 100644
--- a/README.md
+++ b/README.md
@@ -174,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

From 9a913110cf471a8287ac06c43cbe307d3cf6df99 Mon Sep 17 00:00:00 2001
From: nopperl <54780682+nopperl@users.noreply.github.com>
Date: Sat, 28 Sep 2024 12:08:43 +0000
Subject: [PATCH 51/78] llama : add support for Chameleon (#8543)

* convert chameleon hf to gguf

* add chameleon tokenizer tests

* fix lint

* implement chameleon graph

* add swin norm param

* return qk norm weights and biases to original format

* implement swin norm

* suppress image token output

* rem tabs

* add comment to conversion

* fix ci

* check for k norm separately

* adapt to new lora implementation

* fix layer input for swin norm

* move swin_norm in gguf writer

* add comment regarding special token regex in chameleon pre-tokenizer

* Update src/llama.cpp

Co-authored-by: compilade <git@compilade.net>

* fix punctuation regex in chameleon pre-tokenizer (@compilade)

Co-authored-by: compilade <git@compilade.net>

* fix lint

* trigger ci

---------

Co-authored-by: compilade <git@compilade.net>
---
 convert_hf_to_gguf.py                |  44 +++++
 convert_hf_to_gguf_update.py         |   1 +
 gguf-py/gguf/constants.py            |  19 ++
 gguf-py/gguf/gguf_writer.py          |   3 +
 gguf-py/gguf/tensor_mapping.py       |   4 +-
 include/llama.h                      |   1 +
 models/ggml-vocab-chameleon.gguf.inp | 112 ++++++++++++
 models/ggml-vocab-chameleon.gguf.out |  46 +++++
 src/llama-vocab.cpp                  |  14 ++
 src/llama.cpp                        | 263 +++++++++++++++++++++++++++
 10 files changed, 505 insertions(+), 2 deletions(-)
 create mode 100644 models/ggml-vocab-chameleon.gguf.inp
 create mode 100644 models/ggml-vocab-chameleon.gguf.out

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7be609054..2cd5a8c11 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -640,6 +640,9 @@ class Model:
         if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
             # ref: https://huggingface.co/microsoft/phi-2
             res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
 
         if res is None:
             logger.warning("\n")
@@ -4138,6 +4141,47 @@ class GraniteMoeModel(GraniteModel):
         return super().modify_tensors(data_torch, name, bid)
 
 
+@Model.register("ChameleonForCausalLM")
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
+
+
 ###### CONVERSION LOGIC ######
 
 
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 021f65abd..4d11059f3 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -99,6 +99,7 @@ models = [
     {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
 ]
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 560eee916..2fd2e9d2b 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -94,6 +94,7 @@ class Keys:
         DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
         ATTN_LOGIT_SOFTCAPPING            = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING           = "{arch}.final_logit_softcapping"
+        SWIN_NORM                         = "{arch}.swin_norm"
         RESCALE_EVERY_N_LAYERS            = "{arch}.rescale_every_n_layers"
         TIME_MIX_EXTRA_DIM                = "{arch}.time_mix_extra_dim"
         TIME_DECAY_EXTRA_DIM              = "{arch}.time_decay_extra_dim"
@@ -236,6 +237,7 @@ class MODEL_ARCH(IntEnum):
     EXAONE       = auto()
     GRANITE      = auto()
     GRANITE_MOE  = auto()
+    CHAMELEON    = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -394,6 +396,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.EXAONE:         "exaone",
     MODEL_ARCH.GRANITE:        "granite",
     MODEL_ARCH.GRANITE_MOE:    "granitemoe",
+    MODEL_ARCH.CHAMELEON:      "chameleon",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1260,6 +1263,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.CHAMELEON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index bd059b45c..5c460ef1b 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -670,6 +670,9 @@ class GGUFWriter:
     def add_expert_weights_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
 
+    def add_swin_norm(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
+
     def add_rescale_every_n_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 4e850726e..5ef91f11d 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -380,7 +380,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -389,7 +389,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm
diff --git a/include/llama.h b/include/llama.h
index 132937a07..caef0bfff 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -102,6 +102,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
     };
 
     enum llama_rope_type {
diff --git a/models/ggml-vocab-chameleon.gguf.inp b/models/ggml-vocab-chameleon.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-chameleon.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-chameleon.gguf.out b/models/ggml-vocab-chameleon.gguf.out
new file mode 100644
index 000000000..7c5413fee
--- /dev/null
+++ b/models/ggml-vocab-chameleon.gguf.out
@@ -0,0 +1,46 @@
+ 17245 16604 16403 16604 33583 18355
+ 16421 51153
+
+ 16604
+ 16650
+ 16650 16604
+ 16581
+ 16582
+ 16582 16582
+ 16582 16582 16582
+ 16581 16582
+ 31596 17394
+ 34926 17394
+ 31596 18671
+ 34926 18671
+ 34926 18671 16384
+ 31596 16395 17394 16384
+ 34926 16395 17394 16384
+ 16811 16704 20410 16483 16631 16397 52854
+ 16470 16399 16403 16407 16604 16406 35764 38185 51595 22592 26639
+ 29479 23955 17012 20103 25527 27670 17408 19005 21473 24774
+ 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 21954 16607 21954 16633 21954 16611 29409 16607 21954 16615
+ 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 16604 16391 24664 17153 57169 16721 16872 17073 17304 28729 16392
+ 31596
+ 34926
+ 16650 31596
+ 16650 34926
+ 16696 31596
+ 16696 31596 16582 16696 31596
+ 16604 16391
+ 16582 16604 16412
+ 16390 22623
+ 31596 16395 16712 16390 16828 16384 17674 16769 16732 23686 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636
+ 16384 16384 16384 16384 16384 16384
+ 16402
+ 16402 16402
+ 16402 16402 16402
+ 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402 16402 16402
+ 16418 19038 16639 16448 24315 33727 16467
+ 18765 17981
+ 16582 16604 16582 16582 16604 16582 16582 16582 16604 16581 16604 16581 16581 16604 16581 16582 16650 16582 16650 16604 16582 16696 16582 16696 16604 16582 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 20410 16483 16631 18885 16483 16631 16604 16402 16604 16402 16402 16604 16402 16402 16402 16604 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16402 16604 16402 16397 16402 16604 16402 16397 16397 16402 16604 16402 16397 16397 16397 16402 16604 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 27683 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 16604 16396 16396 16396 16396 16396 16396 16412 16412 16412 16412 16412 16412 16412 27268 23955 17012 20103 25527 27670 17408 19005 21473 24774 16604 16390 16390 16390 16390 16390 16390 16447 16447 16447 16447 16447 16447 16447 16385 16385 16385 16385 16397 16397 16397 16397 16397 16397 16384 16384 16384 16384 16384 16384 16414 16414 16414 16414 16414 16414 16687 16390 16690 16992 16604 16390 61797 16733 16390 16466 16986 16395 16604 16390 17879 16732 17811 16414 16604 16390 16428 16804 17811 16687 16390 16683 17190 16728 16395 16604 16390 16419 16732 16945 16991 25251 16414 17119 16390 38127 16641 16390 16459 16427
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a771eccda..146d416f7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -450,6 +450,20 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+                // Note: in theory, the special token (sentinel and image token) regex_exprs below
+                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
+                // However, since the upstream pre-tokenizer uses them, they are also
+                // included here (see https://huggingface.co/facebook/chameleon-7b).
+                regex_exprs = {
+                    "<sentinel:[0-9]+>",  // Sentinel tokens
+                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
+                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
+                    "\\p{N}", // Individual digits
+                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
diff --git a/src/llama.cpp b/src/llama.cpp
index 0accb1492..f450eaf9d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -216,6 +216,7 @@ enum llm_arch {
     LLM_ARCH_RWKV6,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
+    LLM_ARCH_CHAMELEON,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -268,6 +269,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RWKV6,           "rwkv6"        },
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
+    { LLM_ARCH_CHAMELEON,       "chameleon"    },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -304,6 +306,7 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
@@ -411,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,            "%s.attn_logit_softcapping"            },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },
+    { LLM_KV_SWIN_NORM,                         "%s.swin_norm"                         },
     { LLM_KV_RESCALE_EVERY_N_LAYERS,            "%s.rescale_every_n_layers"            },
     { LLM_KV_TIME_MIX_EXTRA_DIM,                "%s.time_mix_extra_dim"                },
     { LLM_KV_TIME_DECAY_EXTRA_DIM,              "%s.time_decay_extra_dim"              },
@@ -1499,6 +1503,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_CHAMELEON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2362,6 +2385,7 @@ struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
     bool use_par_res;
+    bool swin_norm;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -6084,6 +6108,18 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_CHAMELEON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
+                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_34B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+               }
+            } break;
         default: (void)0;
     }
 
@@ -6341,6 +6377,11 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "exaone") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                tokenizer_pre == "chameleon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+                vocab.tokenizer_add_bos = true;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -8728,6 +8769,45 @@ static bool llm_load_tensors(
                     }
 
                 } break;
+            case LLM_ARCH_CHAMELEON:
+                {
+                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                 // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
+                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
+                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -15872,6 +15952,184 @@ struct llm_build_context {
 
         return gf;
     }
+
+    // ref: https://github.com/facebookresearch/chameleon
+    // based on the original build_llama() function, changes:
+    //   * qk-norm
+    //   * swin-norm
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_chameleon() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            if (hparams.swin_norm) {
+                cur = inpL;
+            } else {
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                model.layers[il].attn_q_norm_b,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                               model.layers[il].attn_k_norm,
+                               model.layers[il].attn_k_norm_b,
+                               LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+                if (hparams.swin_norm) {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                }
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            if (!hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            if (hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output_with_img_logits", -1);
+
+        // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+        // Needs to be removed once image outputs are supported.
+        int img_token_end_idx = 8196;
+        int img_token_start_idx = 4;
+        int num_img_tokens = img_token_end_idx - img_token_start_idx;
+        // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+        // which ensures that text token values are always at least larger than image token values
+        struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+        img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+        cb(img_logits, "img_logits", -1);
+        cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16132,6 +16390,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_rwkv6();
             } break;
+        case LLM_ARCH_CHAMELEON:
+            {
+                result = llm.build_chameleon();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -19257,6 +19519,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2

From 6102037bbb55521880ae78a6ee6c2a0c00c901df Mon Sep 17 00:00:00 2001
From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Date: Sat, 28 Sep 2024 20:10:58 +0800
Subject: [PATCH 52/78] vocab : refactor tokenizer to reduce init overhead
 (#9449)

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* refactor tokenizer

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* remove unused files

* remove unused fileds to avoid unused filed build error

* avoid symbol link error

* Update src/llama.cpp

* Update src/llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 .../convert-llama2c-to-ggml.cpp               |  14 +-
 src/llama-vocab.cpp                           | 266 +++++++++++-------
 src/llama-vocab.h                             |   9 +
 src/llama.cpp                                 |   2 +
 tests/test-tokenizer-0.cpp                    |  88 +++---
 5 files changed, 238 insertions(+), 141 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index ecff95f9a..c140daed3 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -201,7 +201,7 @@ static void print_sample_weights(TransformerWeights *w){
 
 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
 
-struct llama_vocab {
+struct my_llama_vocab {
     using id    = int32_t;
     using token = std::string;
     using ttype = llama_token_type;
@@ -525,7 +525,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
     return out.str();
 }
 
-static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
     if (is_ggml_file(filename)) {
         LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
         struct ggml_context * ctx_data = NULL;
@@ -583,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam
         const int  n_vocab = config->vocab_size;
         /* uint32_t max_token_length =  */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+        for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
 
             unsigned char byte_val;
-            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
             if (id == UNKNOWN_TOKEN_ID) {
                 text = "<unk>";
                 type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -631,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
 }
 
 static void save_as_llama_model(
-    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
     // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
@@ -671,7 +671,7 @@ static void save_as_llama_model(
     std::vector<const char*> tokens;
     std::vector<float> scores;
     std::vector<llama_token_type> token_types;
-    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
         tokens.push_back(token_data.text.c_str());
         scores.push_back(token_data.score);
         token_types.push_back(token_data.type);
@@ -905,7 +905,7 @@ int main(int argc, char ** argv) {
         fclose(file);
     }
 
-    struct llama_vocab vocab;
+    struct my_llama_vocab vocab;
     load_vocab(params.fn_vocab_model, &config, &vocab);
 
     struct my_llama_model model;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 146d416f7..e4d844a73 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -50,7 +50,7 @@ struct naive_trie {
             res.first->second.insert(key + 1, len - 1, value);
         }
     }
-    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }
@@ -79,6 +79,15 @@ struct naive_trie {
 // impl
 //
 
+struct llm_tokenizer {
+   llm_tokenizer() {}
+   virtual ~llm_tokenizer() = default;
+};
+
+llama_vocab::~llama_vocab() {
+    delete tokenizer;
+}
+
 int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
     GGML_ASSERT(token_left.find(' ')   == std::string::npos);
     GGML_ASSERT(token_left.find('\n')  == std::string::npos);
@@ -187,10 +196,15 @@ struct llm_bigram_spm {
     size_t size;
 };
 
-struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
+struct llm_tokenizer_spm : llm_tokenizer {
+    llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
+
+struct llm_tokenizer_spm_session {
+    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
@@ -271,7 +285,7 @@ private:
             return;
         }
 
-        resegment(symbols[p->second.first],  output);
+        resegment(symbols[p->second.first], output);
         resegment(symbols[p->second.second], output);
     }
 
@@ -279,7 +293,6 @@ private:
         if (left == -1 || right == -1) {
             return;
         }
-
         const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
         auto token = vocab.token_to_id.find(text);
 
@@ -306,10 +319,11 @@ private:
     }
 
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_spm * spm_tokenizer;
 
     std::vector<llm_symbol> symbols;
     llm_bigram_spm::queue work_queue;
-
     std::map<std::string, std::pair<int, int>> rev_merge;
 };
 
@@ -352,8 +366,8 @@ struct llm_bigram_bpe {
     size_t size;
 };
 
-struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+struct llm_tokenizer_bpe : llm_tokenizer {
+    llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
         GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
@@ -476,7 +490,14 @@ struct llm_tokenizer_bpe {
         }
     }
 
-    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+    std::vector<std::string> regex_exprs;
+};
+
+struct llm_tokenizer_bpe_session {
+    llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
+        bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
+
+    static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output)  {
         output.push_back(token_id);
     }
 
@@ -515,12 +536,11 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
+        const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
 
         symbols_final.clear();
 
-        for (auto & word : word_collection) {
+        for (const auto & word : word_collection) {
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
@@ -623,7 +643,6 @@ private:
         if (left == -1 || right == -1) {
             return;
         }
-
         std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
         std::string right_token = std::string(symbols[right].text, symbols[right].n);
 
@@ -647,12 +666,10 @@ private:
     }
 
     const llama_vocab & vocab;
-
-    std::vector<std::string> regex_exprs;
+    const llm_tokenizer_bpe * bpe_tokenizer;
 
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
-
     llm_bigram_bpe::queue work_queue;
 };
 
@@ -660,15 +677,17 @@ private:
 // WPM tokenizer
 //
 
-struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+struct llm_tokenizer_wpm : llm_tokenizer {
+    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
+struct llm_tokenizer_wpm_session {
+    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         const auto & token_map = vocab.token_to_id;
-
         // normalize and split by whitespace
         std::vector<std::string> words = preprocess(text);
-
         // bos token prepended already
 
         // find the longest tokens that form the words
@@ -713,7 +732,7 @@ struct llm_tokenizer_wpm {
     }
 
     // TODO: reduce string copies by using cpts_offs array
-    std::vector<std::string> preprocess(const std::string & text) const {
+    static std::vector<std::string> preprocess(const std::string & text)  {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
@@ -765,15 +784,18 @@ struct llm_tokenizer_wpm {
             //(cpt >= 0xFF00  && cpt <= 0xFFEF);
     }
 
+private:
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_wpm * wpm_tokenizer;
 };
 
 //
 // UGM tokenizer
 //
 
-struct llm_tokenizer_ugm {
-    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+struct llm_tokenizer_ugm : llm_tokenizer {
+    llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
         if (vocab.precompiled_charsmap.size() > 0) {
             size_t charsmap_offset = 0;
 
@@ -819,6 +841,30 @@ struct llm_tokenizer_ugm {
         unknown_token_score = min_score - unknown_token_score_penalty;
     }
 
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_ugm_session {
+    llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
+        ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
+
     /* This implementation is based on SentencePiece optimized Viterbi algorithm for
      * unigram language models. The general idea is to:
      * - move along the input sequence in steps of one UTF code point,
@@ -857,7 +903,7 @@ struct llm_tokenizer_ugm {
             // traverse the token matcher trie to find a matching token
             bool single_codepoint_token_found = false;
             const struct best_tokenization & current_best = tokenization_results[input_offset];
-            const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+            const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
 
             while (prefix_offset <= input_len && node != NULL) {
                 // check if we found valid token in prefix
@@ -887,7 +933,7 @@ struct llm_tokenizer_ugm {
             // if we didn't find a valid token corresponding to the whole UTF code point
             // then use unknown token as the tokenization of this UTF code point
             if (!single_codepoint_token_found) {
-                const double challenger_score = current_best.score_sum + unknown_token_score;
+                const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
@@ -919,7 +965,6 @@ struct llm_tokenizer_ugm {
     }
 
 private:
-    const llama_vocab & vocab;
 
     // helper structure for returning normalization results
     struct normalization_result {
@@ -932,7 +977,7 @@ private:
         normalized->clear();
         normalized->reserve(input.size() * 3);
 
-        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+        const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
 
         bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
         bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
@@ -1014,13 +1059,21 @@ private:
         size_t xcda_array_size;
     };
 
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
         if (input_offset == input.size()) {
             return { &input[input_offset], 0, 0 };
         }
 
         // if input prefix matches some user-defined token return this token as normalization result
-        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        auto user_defined_token_match =
+           ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
         if (user_defined_token_match.second > 0) {
             return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
         }
@@ -1028,8 +1081,8 @@ private:
         size_t longest_prefix_length = 0;
         size_t longest_prefix_offset = 0;
 
-        if (xcda_array_size > 0) {
-            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+        if (ugm_tokenizer->xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
 
             // Find the longest normalized sequence matching the input prefix by walking
             // the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1065,50 +1118,27 @@ private:
 
         if (longest_prefix_length > 0) {
             // we have a match, so return the replacement sequence
-            if (longest_prefix_offset >= prefix_replacements_size) {
+            if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
-            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+            const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
             return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
-        } else {
-            // check if the input prefix contains a valid sequence of UTF-8 code units
-            try {
-                // if yes, return this sequence unmodified
-                size_t prefix_offset = input_offset;
-                unicode_cpt_from_utf8(input, prefix_offset);
-                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
-            } catch (std::invalid_argument & /*ex*/) {
-                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
-                return { "\xEF\xBF\xBD", 3, 1 };
-            }
+        }
+
+        // check if the input prefix contains a valid sequence of UTF-8 code units
+        try {
+            // if yes, return this sequence unmodified
+            size_t prefix_offset = input_offset;
+            unicode_cpt_from_utf8(input, prefix_offset);
+            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+        } catch (std::invalid_argument & /*ex*/) {
+            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+            return { "\xEF\xBF\xBD", 3, 1 };
         }
     }
 
-    // escaped space symbol - U+2581 (Lower One Eighth Block)
-    const std::string escaped_space = "\xE2\x96\x81";
-
-    const char * prefix_replacements = NULL;
-    size_t prefix_replacements_size = 0;
-
-    const uint32_t * xcda_array = NULL;
-    size_t xcda_array_size = 0;
-
-    struct naive_trie user_defined_token_matcher;
-
-    // this structure stores the best tokenization so far at input_offset
-    struct best_tokenization {
-        llama_token token_id;
-        size_t input_offset;
-        float score_sum;
-    };
-
-    float min_score = FLT_MAX;
-    float max_score = -FLT_MAX;
-
-    float unknown_token_score_penalty = 10.0;
-    float unknown_token_score;
-
-    struct naive_trie token_matcher;
+    const llama_vocab & vocab;
+    const llm_tokenizer_ugm * ugm_tokenizer;
 };
 
 //
@@ -1169,8 +1199,8 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
     return output;
 }
 
-struct llm_tokenizer_rwkv {
-    llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
+struct llm_tokenizer_rwkv : llm_tokenizer {
+    llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
         // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
         // For now, we decode the vocab here into the lookup we'll use for tokenization.
 
@@ -1182,11 +1212,17 @@ struct llm_tokenizer_rwkv {
         }
     }
 
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_rwkv_session {
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
+        rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
+
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         uint32_t position = 0;
-
         while (position < text.size()) {
-            const struct naive_trie * node = token_matcher.traverse(text[position]);
+            const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
             if (node == NULL) {
                 // no matching token found, add unknown token
                 output.push_back(vocab.special_unk_id);
@@ -1211,11 +1247,33 @@ struct llm_tokenizer_rwkv {
         }
     }
 
+private:
     const llama_vocab & vocab;
-
-    struct naive_trie token_matcher;
+    const llm_tokenizer_rwkv & rwkv_tokenizer;
 };
 
+void llama_vocab::init_tokenizer() {
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            tokenizer = new llm_tokenizer_spm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            tokenizer = new llm_tokenizer_bpe(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            tokenizer = new llm_tokenizer_wpm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            tokenizer = new llm_tokenizer_ugm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            tokenizer = new llm_tokenizer_rwkv(*this);
+            break;
+        default:
+            GGML_ABORT("unsupported vocab type");
+    }
+}
+
 //
 // (de-) tokenize
 //
@@ -1277,7 +1335,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
             // if a fragment is text ( not yet processed )
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto & raw_text = fragment.raw_text;
+                const auto & raw_text = fragment.raw_text;
 
                 auto raw_text_base_offset = fragment.offset;
                 auto raw_text_base_length = fragment.length;
@@ -1376,7 +1434,13 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     std::vector<llama_vocab::id> output;
     std::forward_list<fragment_buffer_variant> fragment_buffer;
 
@@ -1413,9 +1477,9 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
-                        tokenizer.tokenize(raw_text, output);
+                        llm_tokenizer_spm_session session(vocab);
+                        session.tokenize(raw_text, output);
                         is_prev_special = false;
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -1437,10 +1501,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab);
-
+                llm_tokenizer_bpe_session session(vocab);
+                // it calls some other methods that are not exist in llm_tokenizer,
+                // here just cast it to bpe tokenizer object
                 if (add_special) {
-                    tokenizer.append_bos(output);
+                    session.append_bos(output);
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1449,15 +1514,15 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        tokenizer.append(fragment.token, output);
+                        session.append(fragment.token, output);
                     }
                 }
 
                 if (add_special) {
-                    tokenizer.append_eos(output);
-                    tokenizer.check_double_bos_eos(output);
+                    session.append_eos(output);
+                    session.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
@@ -1467,7 +1532,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                     output.push_back(vocab.special_cls_id);
                 }
 
-                llm_tokenizer_wpm tokenizer(vocab);
+                llm_tokenizer_wpm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1476,7 +1541,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1489,12 +1554,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
-                llm_tokenizer_ugm tokenizer(vocab);
-
                 if (add_special && vocab.tokenizer_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
+                llm_tokenizer_ugm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1502,7 +1566,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1522,6 +1586,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_RWKV:
             {
+                llm_tokenizer_rwkv_session session(vocab);
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -1530,8 +1595,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
 
-                        llm_tokenizer_rwkv tokenizer(vocab);
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1644,13 +1708,13 @@ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
 }
 
 int32_t llama_tokenize_impl(
-    const struct llama_vocab & vocab,
-                  const char * text,
-                     int32_t   text_len,
-                 llama_token * tokens,
-                     int32_t   n_tokens_max,
-                        bool   add_special,
-                        bool   parse_special) {
+        const struct llama_vocab & vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special) {
     auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -1775,6 +1839,8 @@ int32_t llama_detokenize_impl(
                          int32_t   text_len_max,
                             bool   remove_special,
                             bool   unparse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     int32_t avail = text_len_max;
     int32_t total = 0;
 
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index cc46f642b..069bdc423 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -8,6 +8,8 @@
 #include <map>
 #include <set>
 
+struct llm_tokenizer;
+
 struct llama_vocab {
     using id    = llama_token;
     using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {
 
     std::vector<char> precompiled_charsmap;
 
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
 };
 
 //
diff --git a/src/llama.cpp b/src/llama.cpp
index f450eaf9d..44afb31d7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6464,6 +6464,8 @@ static void llm_load_vocab(
     }
     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
+    vocab.init_tokenizer();
+
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         // For Fill-In-the-Middle (FIM)/infill models which where converted
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index d3d21331b..4d49850c9 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -7,6 +7,7 @@
 #include <map>
 #include <vector>
 #include <fstream>
+#include <thread>
 
 //static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 //    static std::map<std::string, std::vector<llama_token>> _k_tests = {
@@ -194,45 +195,64 @@ int main(int argc, char **argv) {
 
     const bool add_special = false;
 
-    for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
 
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
 
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        llama_detokenize(ctx, res).c_str(),
+                        llama_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens:      ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
             }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
+        });
     }
 
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
+    }
+
+    // single threaded tokenization
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 

From 739842703e32cd43443c45e0b4f6647cc4e6b3d6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 28 Sep 2024 15:13:21 +0300
Subject: [PATCH 53/78] llama : add comment about thread-safety [no ci] (#9449)

---
 include/llama.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index caef0bfff..4ea8a2c2b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -911,6 +911,8 @@ extern "C" {
     //
     // Tokenization
     //
+    // The API is thread-safe.
+    //
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.

From 1b2f992cd2cff0b69e5abe78bb8888d51ed19d67 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 28 Sep 2024 14:32:46 +0200
Subject: [PATCH 54/78] test-backend-ops : use flops for some performance tests
 (#9657)

* test-backend-ops : use flops for some performance tests

- parallelize tensor quantization

- use a different set of cases for performance and correctness tests

- run each test for at least one second
---
 tests/test-backend-ops.cpp | 411 +++++++++++++++++++------------------
 1 file changed, 214 insertions(+), 197 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9a96cfc4c..d2cfe06b5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -32,63 +32,52 @@
 #include <stdlib.h>
 #include <string>
 #include <thread>
+#include <future>
 #include <vector>
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector<std::default_random_engine> generators = []() {
-        std::random_device rd;
-        std::vector<std::default_random_engine> vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
+    size_t nels = ggml_nelements(tensor);
+    std::vector<float> data(nels);
+    {
+        // parallel initialization
+        static const size_t n_threads = std::thread::hardware_concurrency();
+        // static RNG initialization (revisit if n_threads stops being constant)
+        static std::vector<std::default_random_engine> generators = []() {
+            std::random_device rd;
+            std::vector<std::default_random_engine> vec;
+            vec.reserve(n_threads);
+            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+            return vec;
+        }();
 
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
+        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+            std::uniform_real_distribution<float> distribution(min, max);
+            auto & gen = generators[ith];
+            for (size_t i = start; i < end; i++) {
+                data[i] = distribution(gen);
+            }
+        };
 
-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution<float> distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
+        std::vector<std::future<void>> tasks;
+        tasks.reserve(n_threads);
+        for (size_t i = 0; i < n_threads; i++) {
+            size_t start =     i*nels/n_threads;
+            size_t end   = (i+1)*nels/n_threads;
+            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
         }
-    };
-
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start =     i*size/n_threads;
-        size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-
-#if 0
-    const char * val_str = getenv("GGML_TEST_EPS");
-    float val = 1e-9f;
-    if (val_str != nullptr) {
-        val = std::stof(val_str);
-        printf("GGML_TEST_EPS=%e\n", val);
-    }
-
-    // test quantization with very small values that may result in nan scales due to division by zero
-    if (ggml_is_quantized(tensor->type)) {
-        for (int i = 0; i < 256; i++) {
-            data[i] = val;
+        for (auto & t : tasks) {
+            t.get();
         }
     }
-#endif
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
+
+         // dummy importance matrix
+        std::vector<float> imatrix(tensor->ne[0], 1.0f);
         const float * im = imatrix.data();
         if (!ggml_quantize_requires_imatrix(tensor->type)) {
             // when the imatrix is optional, we want to test both quantization with and without imatrix
@@ -98,15 +87,31 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }
 
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        // TODO: other cases
-        //#pragma omp parallel for
-        //for (int i = 0; i < tensor->ne[1]; i++) {
-        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
-        //        i * tensor->ne[0], 1, tensor->ne[0], im);
-        //}
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, nels));
+        {
+            // parallel quantization by block
+            size_t blck_size = ggml_blck_size(tensor->type);
+            size_t n_blocks = nels / blck_size;
 
+            auto quantize_thread = [&](size_t start, size_t end) {
+                ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+                    start * blck_size, end - start, blck_size, im);
+            };
+
+            const size_t min_blocks_per_thread = 1;
+            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
+                                                      std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+            std::vector<std::future<void>> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start =     i*n_blocks/n_threads;
+                size_t end   = (i+1)*n_blocks/n_threads;
+                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -160,60 +165,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     return tv;
 }
 
-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot  += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
 // normalized mean squared error = mse(a, b) / mse(a, 0)
 static double nmse(const float * a, const float * b, size_t n) {
     double mse_a_b = 0.0;
@@ -264,7 +215,6 @@ static double mean_abs_asymm(const float * a, const float * b, const size_t n, c
 }
 
 // utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 template<typename T>
 static std::string var_to_str(const T & x) {
@@ -297,10 +247,6 @@ static std::string var_to_str(const std::array<T, N> & x) {
     return s;
 }
 
-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
 static std::string var_to_str(ggml_type type) {
     return ggml_type_name(type);
 }
@@ -313,6 +259,8 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }
 
+#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
 #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
 #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
@@ -370,13 +318,13 @@ struct test_case {
         return 1e-4;
     }
 
-    virtual float grad_eps(){
+    virtual float grad_eps() {
         return 1e-1f;
     }
 
     // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
     // If true,  estimate gradient with 4 points, neglects 5th order derivative and higher.
-    virtual bool grad_precise(){
+    virtual bool grad_precise() {
         return false;
     }
 
@@ -409,6 +357,11 @@ struct test_case {
         return size;
     }
 
+    virtual uint64_t op_flops(ggml_tensor * t) {
+        GGML_UNUSED(t);
+        return 0;
+    }
+
     ggml_cgraph * gf = nullptr;
     ggml_cgraph * gb = nullptr;
 
@@ -651,12 +604,11 @@ struct test_case {
         }
 
         // align while also leaving some margin for variations in parameters
-        int align = 20;
+        int align = 8;
         int last = (len + align - 1) / align * align;
         if (last - len < 5) {
             last += align;
         }
-        last = std::max(last, 60);
         printf("%*s", last - len, "");
 
         // allocate
@@ -677,9 +629,25 @@ struct test_case {
         // warmup run
         ggml_backend_graph_compute(backend, gf);
 
+        // determine number of runs
+        int n_runs;
+        if (op_flops(out) > 0) {
+            // based on flops
+            const uint64_t GFLOP = 1000 * 1000 * 1000;
+            const uint64_t target_flops_cpu =   8ULL * GFLOP;
+            const uint64_t target_flops_gpu = 100ULL * GFLOP;
+            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
+        } else {
+            // based on memory size
+            const size_t GB = 1ULL << 30;
+            const size_t target_size_cpu =  8 * GB;
+            const size_t target_size_gpu = 32 * GB;
+            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
+        }
+
         // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
             ggml_graph_add_node(gf, out);
         }
@@ -706,17 +674,47 @@ struct test_case {
         // run
         ggml_backend_synchronize(backend);
 
-        int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
+        int64_t total_time_us = 0;
+        int total_runs = 0;
+        do {
+            int64_t start_time = ggml_time_us();
+            ggml_backend_graph_compute(backend, gf);
+            ggml_backend_synchronize(backend);
+            int64_t end_time = ggml_time_us();
 
-        printf("    %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
+            total_time_us += end_time - start_time;
+            total_runs += n_runs;
+        } while (total_time_us < 1000*1000); // run for at least 1 second
+
+        printf("    %8d runs - %8.2f us/run - ",
+            total_runs,
+            (double)total_time_us / total_runs);
+
+        if (op_flops(out) > 0) {
+            double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
+            auto format_flops = [](double flops) -> std::string {
+                char buf[256];
+                if (flops >= 1e12) {
+                    snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
+                } else if (flops >= 1e9) {
+                    snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
+                } else if (flops >= 1e6) {
+                    snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
+                }
+                return buf;
+            };
+            printf("%s/run - \033[1;34m%sS\033[0m",
+                format_flops(op_flops(out)).c_str(),
+                format_flops(flops_per_sec).c_str());
+
+        } else {
+            printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
+                op_size(out) / 1024,
+                mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+        }
+        printf("\n");
 
         ggml_backend_buffer_free(buf);
 
@@ -1591,13 +1589,9 @@ struct test_mul_mat : public test_case {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
     }
 
     test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -1641,13 +1635,9 @@ struct test_mul_mat_id : public test_case {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n;
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * k * n * n_used;
     }
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -3163,47 +3153,46 @@ struct test_falcon : public test_llm {
 // ###########################################
 // ## Section 3: GGML Op Test Instantiation ##
 // ###########################################
+static const ggml_type all_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+    GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+};
 
+static const ggml_type base_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_K,
+    GGML_TYPE_IQ2_XXS
+};
 
-static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
+static const ggml_type other_types[] = {
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+    GGML_TYPE_BF16,
+};
+
+// Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
+static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     std::vector<std::unique_ptr<test_case>> test_cases;
     std::default_random_engine rng(0);
 
-    const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
-        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
-        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
-    };
-
-    const ggml_type base_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
-        GGML_TYPE_Q4_0,
-        GGML_TYPE_Q4_K,
-        GGML_TYPE_IQ2_XXS
-    };
-
-    const ggml_type other_types[] = {
-        GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
-        GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
-        GGML_TYPE_BF16,
-    };
-
     // unary ops
     for (int v : {0, 1}) {
         for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
@@ -3392,6 +3381,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
         }
     }
+    for (ggml_type type_a : other_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32}) {
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
+        }
+    }
 #else
     // m = a rows
     // n = b rows
@@ -3411,15 +3408,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 #endif
 
-    for (ggml_type type_a : other_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32}) {
-            if (ggml_blck_size(type_a) != 256) {
-                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
-            }
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
-        }
-    }
-
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,  128, { 8,  1}, {1, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,  128, { 8,  1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,   64, { 8,  1}, {4, 1}));
@@ -3624,20 +3612,30 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_falcon(2));
 #endif
 
-    // run tests
-    if (mode == MODE_GRAD) {
-        size_t n_ok = 0;
-        for (auto & test : test_cases) {
-            if (test->eval_grad(backend, op_name)) {
-                n_ok++;
+    return test_cases;
+}
+
+// Test cases for performance evaluation: should be representative of real-world use cases
+static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1,   1, 1, 1}));
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+    for (int bs : {1, 512}) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1,  1}, {1, 1}));
             }
         }
-        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
-
-        return n_ok == test_cases.size();
     }
 
+    return test_cases;
+}
+
+static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
+        auto test_cases = make_test_cases_eval();
         ggml_backend_t backend_cpu = ggml_backend_cpu_init();
 
         size_t n_ok = 0;
@@ -3653,7 +3651,21 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return n_ok == test_cases.size();
     }
 
+    if (mode == MODE_GRAD) {
+        auto test_cases = make_test_cases_eval();
+        size_t n_ok = 0;
+        for (auto & test : test_cases) {
+            if (test->eval_grad(backend, op_name)) {
+                n_ok++;
+            }
+        }
+        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
+
+        return n_ok == test_cases.size();
+    }
+
     if (mode == MODE_PERF) {
+        auto test_cases = make_test_cases_perf();
         for (auto & test : test_cases) {
             test->eval_perf(backend, op_name);
         }
@@ -3667,9 +3679,9 @@ static void usage(char ** argv) {
     printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
     printf("    valid modes:\n");
     printf("      - test (default, compare with CPU backend for correctness)\n");
-    printf("      - perf (performance evaluation)\n");
     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
-    printf("    op names are as given by ggml_op_desc() (e.g. GGML_ADD)\n");
+    printf("      - perf (performance evaluation)\n");
+    printf("    op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
 }
 
 int main(int argc, char ** argv) {
@@ -3728,6 +3740,11 @@ int main(int argc, char ** argv) {
             continue;
         }
 
+        if (ggml_backend_is_cpu(backend)) {
+            // TODO: better value for n_threads
+            ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
+        }
+
         printf("  Backend name: %s\n", ggml_backend_name(backend));
 
         bool ok = test_backend(backend, mode, op_name_filter);

From f4d2b8846a6b34419ff9e9491aee6cd95e444bfc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 28 Sep 2024 17:42:03 +0300
Subject: [PATCH 55/78] llama : add reranking support (#9510)

* py : add XLMRobertaForSequenceClassification [no ci]

* py : fix scalar-tensor conversion [no ci]

* py : fix position embeddings chop [no ci]

* llama : read new cls tensors [no ci]

* llama : add classigication head (wip) [no ci]

* llama : add "rank" pooling type

ggml-ci

* server : add rerank endpoint

ggml-ci

* llama : aboud ggml_repeat during classification

* rerank : cleanup + comments

* server : accept /rerank endpoint in addition to /v1/rerank [no ci]

* embedding : parse special tokens

* jina : support v1 reranker

* vocab : minor style

ggml-ci

* server : initiate tests for later

ggml-ci

* server : add docs

* llama : add comment [no ci]

* llama : fix uninitialized tensors

* ci : add rerank tests

ggml-ci

* add reranking test

* change test data

* Update examples/server/server.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* add `--reranking` argument

* update server docs

* llama : fix comment [no ci]

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
---
 ci/run.sh                                     |  85 ++++++-
 common/arg.cpp                                |  18 +-
 common/common.cpp                             |   5 +
 common/common.h                               |   1 +
 convert_hf_to_gguf.py                         |  27 ++-
 convert_hf_to_gguf_update.py                  |   1 +
 examples/embedding/embedding.cpp              |   7 +-
 examples/server/README.md                     |  39 +++-
 examples/server/server.cpp                    | 215 ++++++++++++++++--
 .../server/tests/features/embeddings.feature  |   2 +-
 examples/server/tests/features/rerank.feature |  42 ++++
 examples/server/tests/features/steps/steps.py |  54 ++++-
 examples/server/utils.hpp                     |  25 +-
 gguf-py/gguf/constants.py                     |   7 +
 gguf-py/gguf/tensor_mapping.py                |   9 +
 include/llama.h                               |  10 +-
 src/llama-vocab.cpp                           |  15 +-
 src/llama.cpp                                 |  96 +++++++-
 18 files changed, 602 insertions(+), 56 deletions(-)
 create mode 100644 examples/server/tests/features/rerank.feature

diff --git a/ci/run.sh b/ci/run.sh
index 1ac08ee4e..7d241ecc0 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -712,6 +712,81 @@ function gg_run_embd_bge_small {
     set +e
 }
 
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
+# rerank_tiny
+
+function gg_run_rerank_tiny {
+    cd ${SRC}
+
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+
+    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/rerank-tiny"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+
+    (time ./bin/llama-embedding --model ${model_f16}  -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+
+    # sample output
+    # rerank score 0:    0.029
+    # rerank score 1:    0.029
+    # rerank score 2:    0.135
+
+    # check that the score is in the range [$3, $4]
+    function check_score {
+        qnt="$1"
+        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$score"
+        return 0
+    }
+
+    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
+
+    set +e
+}
+
+function gg_sum_rerank_tiny {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Rerank Tiny (Jina):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
+}
+
 function gg_check_build_requirements {
     if ! command -v cmake &> /dev/null; then
         gg_printf 'cmake not found, please install'
@@ -726,15 +801,6 @@ function gg_check_build_requirements {
     fi
 }
 
-function gg_sum_embd_bge_small {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'BGE Small (BERT):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-}
-
 ## main
 
 export LLAMA_LOG_PREFIX=1
@@ -762,6 +828,7 @@ test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
+    test $ret -eq 0 && gg_run rerank_tiny
 
     if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
         test $ret -eq 0 && gg_run test_scripts_debug
diff --git a/common/arg.cpp b/common/arg.cpp
index 6880117ed..8266a16c2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -284,6 +284,10 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (params.reranking && params.embedding) {
+        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
+    }
+
     return true;
 }
 
@@ -391,7 +395,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.verbose_prompt = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
@@ -1093,13 +1097,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_sparam());
     add_opt(llama_arg(
-        {"--pooling"}, "{none,mean,cls,last}",
+        {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
         [](gpt_params & params, const std::string & value) {
             /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
             else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
@@ -1749,6 +1754,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    add_opt(llama_arg(
+        {"--reranking", "--rerank"},
+        format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.reranking = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(llama_arg(
         {"--api-key"}, "KEY",
         "API key to use for authentication (default: none)",
diff --git a/common/common.cpp b/common/common.cpp
index 8d0ed4f95..e2b8574bf 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1023,6 +1023,11 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.flash_attn        = params.flash_attn;
     cparams.no_perf           = params.no_perf;
 
+    if (params.reranking) {
+        cparams.embeddings    = true;
+        cparams.pooling_type  = LLAMA_POOLING_TYPE_RANK;
+    }
+
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
 
diff --git a/common/common.h b/common/common.h
index cb87c4479..8b84cf9ad 100644
--- a/common/common.h
+++ b/common/common.h
@@ -271,6 +271,7 @@ struct gpt_params {
     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep   = "\n";  // separator of embendings
+    bool reranking         = false; // enable reranking support on server
 
     // server params
     int32_t port           = 8080;         // server listens on this network port
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2cd5a8c11..96a8830e9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -291,8 +291,13 @@ class Model:
                     bid = int(part)
                     break
 
-            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray  # type hint
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                data = data_torch.squeeze().numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -592,6 +597,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -2601,7 +2609,7 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
-@Model.register("XLMRobertaModel")
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2699,6 +2707,11 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_add_eos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
@@ -3110,6 +3123,14 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 4d11059f3..022354a3b 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -81,6 +81,7 @@ models = [
     {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
     {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index a438dcb5a..734926822 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -234,6 +234,11 @@ int main(int argc, char ** argv) {
                 }
                 LOG("\n");
             }
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+            }
         } else {
             // print the first part of the embeddings or for a single prompt, the full embedding
             for (int j = 0; j < n_prompts; j++) {
diff --git a/examples/server/README.md b/examples/server/README.md
index dfca07f98..951c4a44c 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 **Features:**
  * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
  * Parallel decoding with multi-user support
  * Continuous batching
  * Multimodal (wip)
@@ -23,6 +24,7 @@ The project is under active development, and we are [looking for feedback and co
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
+| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
@@ -130,7 +132,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
 | `-sp, --special` | special tokens output enabled (default: false) |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
+| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
@@ -138,6 +140,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
+| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
@@ -152,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 
+
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
 
 Example usage of docker compose with environment variables:
@@ -478,6 +482,39 @@ The same as [the embedding example](../embedding) does.
 
     `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
+### POST `/reranking`: Rerank documents according to a given query
+
+Similar to https://jina.ai/reranker/ but might change in the future.
+Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
+
+    *Options:*
+
+    `query`: The query against which the documents will be ranked.
+
+    `documents`: An array strings representing the documents to be ranked.
+
+    *Aliases:*
+      - `/rerank`
+      - `/v1/rerank`
+      - `/v1/reranking`
+
+    *Examples:*
+
+    ```shell
+    curl http://127.0.0.1:8012/v1/rerank \
+        -H "Content-Type: application/json" \
+        -d '{
+            "model": "some-model",
+                "query": "What is panda?",
+                "top_n": 3,
+                "documents": [
+                    "hi",
+                "it is a bear",
+                "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+                ]
+        }' | jq
+    ```
+
 ### POST `/infill`: For code infilling.
 
 Takes a prefix and a suffix and returns the predicted completion as stream.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 61ff09bb2..f343cc252 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -92,6 +92,7 @@ enum server_task_type {
 enum server_task_cmpl_type {
     SERVER_TASK_CMPL_TYPE_NORMAL,
     SERVER_TASK_CMPL_TYPE_EMBEDDING,
+    SERVER_TASK_CMPL_TYPE_RERANK,
     SERVER_TASK_CMPL_TYPE_INFILL,
 };
 
@@ -172,6 +173,7 @@ struct server_slot {
     std::vector<completion_token_output> generated_token_probs;
 
     server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
+
     bool has_next_token = true;
     bool truncated      = false;
     bool stopped_eos    = false;
@@ -954,8 +956,17 @@ struct server_context {
                 slot.prompt = *prompt;
             } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
                 slot.prompt = prompt->at(0);
+            } else if (prompt->is_array() && prompt->size() > 1) {
+                // array of strings
+                for (const auto & el : *prompt) {
+                    if (!el.is_string()) {
+                        send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
+                        return false;
+                    }
+                }
+                slot.prompt = *prompt;
             } else {
-                send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
+                send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         }
@@ -1389,6 +1400,7 @@ struct server_context {
 
                 res.data = json {
                     {"embedding", std::vector<float>(n_embd, 0.0f)},
+                    {"index",     slot.index},
                 };
 
                 continue;
@@ -1407,6 +1419,44 @@ struct server_context {
         queue_results.send(res);
     }
 
+    void send_rerank(const server_slot & slot, const llama_batch & batch) {
+        server_task_result res;
+        res.id       = slot.id_task;
+        res.error    = false;
+        res.stop     = true;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+                continue;
+            }
+
+            const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            if (embd == NULL) {
+                embd = llama_get_embeddings_ith(ctx, i);
+            }
+
+            if (embd == NULL) {
+                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
+
+                res.data = json {
+                    {"index", slot.index},
+                    {"score", -1e6},
+                };
+
+                continue;
+            }
+
+            res.data = json {
+                {"index", slot.index},
+                {"score", embd[0]},
+            };
+        }
+
+        SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());
+
+        queue_results.send(res);
+    }
+
     //
     // Functions to create new task(s) and receive result(s)
     //
@@ -1442,13 +1492,27 @@ struct server_context {
         // otherwise, it's a multiple-prompt task, we break it into smaller tasks
         else if (prompt.is_array()) {
             std::vector<json> prompts = prompt;
-            for (size_t i = 0; i < prompts.size(); i++) {
-                const auto & e = prompts[i];
-                if (e.is_string() || json_is_array_of_numbers(e)) {
-                    data["index"] = i;
-                    create_task(data, true, e);
-                } else {
-                    throw std::runtime_error(error_msg);
+            if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                // prompts[0] is the question
+                // the rest are the answers/documents
+                SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1);
+                for (size_t i = 1; i < prompts.size(); i++) {
+                    json qd;
+                    qd.push_back(prompts[0]);
+                    qd.push_back(prompts[i]);
+                    data["index"] = i - 1;
+                    create_task(data, true, qd);
+                }
+            } else {
+                SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size());
+                for (size_t i = 0; i < prompts.size(); i++) {
+                    const auto & e = prompts[i];
+                    if (e.is_string() || json_is_array_of_numbers(e)) {
+                        data["index"] = i;
+                        create_task(data, true, e);
+                    } else {
+                        throw std::runtime_error(error_msg);
+                    }
                 }
             }
         }
@@ -1492,7 +1556,9 @@ struct server_context {
                 return;
             }
 
-            size_t idx = result.data["index"];
+            const size_t idx = result.data["index"];
+            GGML_ASSERT(idx < results.size() && "index out of range");
+
             results[idx] = result;
         }
         result_handler(results);
@@ -1903,6 +1969,7 @@ struct server_context {
         // track if this is an embedding or non-embedding batch
         // if we've added sampled tokens above, we are in non-embedding mode
         // -1: none, 0: non-embedding, 1: embedding
+        // TODO: make enum
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
 
         // next, batch any pending prompts without exceeding n_batch
@@ -1951,6 +2018,29 @@ struct server_context {
                             }
 
                             prompt_tokens = embd_inp;
+                        } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                            // require slot.prompt to be array of 2 strings
+                            if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
+                                SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
+                                slot.release();
+                                send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
+                                continue;
+                            }
+
+                            // prompt: <s>query</s><s>doc</s>
+                            prompt_tokens.clear();
+                            prompt_tokens.push_back(llama_token_bos(model));
+                            {
+                                const auto part = tokenize(slot.prompt[0], false);
+                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                            }
+                            prompt_tokens.push_back(llama_token_eos(model));
+                            prompt_tokens.push_back(llama_token_bos(model));
+                            {
+                                const auto part = tokenize(slot.prompt[1], false);
+                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                            }
+                            prompt_tokens.push_back(llama_token_eos(model));
                         } else {
                             prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                         }
@@ -1970,7 +2060,7 @@ struct server_context {
                             continue;
                         }
 
-                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                             // this prompt is too large to process - discard it
                             if (slot.n_prompt_tokens > n_ubatch) {
                                 slot.release();
@@ -2048,7 +2138,8 @@ struct server_context {
                         slot.n_prompt_tokens_processed = 0;
                     }
 
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                    // non-causal tasks require to fit the entire prompt in the physical batch
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                         // cannot fit the prompt in the current batch - will try next iter
                         if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                             continue;
@@ -2056,7 +2147,10 @@ struct server_context {
                     }
 
                     // check that we are in the right batch_type, if not defer the slot
-                    bool slot_type = slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ? 1 : 0;
+                    const bool slot_type =
+                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
+                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK     ? 1 : 0;
+
                     if (batch_type == -1) {
                         batch_type = slot_type;
                     } else if (batch_type != slot_type) {
@@ -2229,6 +2323,13 @@ struct server_context {
                         continue; // continue loop of slots
                     }
 
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                        send_rerank(slot, batch_view);
+                        slot.release();
+                        slot.i_batch = -1;
+                        continue; // continue loop of slots
+                    }
+
                     // prompt evaluated for next-token prediction
                     slot.state = SLOT_STATE_GENERATING;
                 } else if (slot.state != SLOT_STATE_GENERATING) {
@@ -2787,8 +2888,8 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+        if (ctx_server.params.embedding || ctx_server.params.reranking) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
@@ -2848,8 +2949,8 @@ int main(int argc, char ** argv) {
 
     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+        if (ctx_server.params.embedding || ctx_server.params.reranking) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
@@ -2973,6 +3074,11 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        // TODO: somehow clean up this checks in the future
+        if (!ctx_server.params.embedding || ctx_server.params.reranking) {
+            res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
         const json body = json::parse(req.body);
         bool is_openai = false;
 
@@ -3023,6 +3129,79 @@ int main(int argc, char ** argv) {
         res_ok(res, root);
     };
 
+    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.reranking) {
+            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+        const json body = json::parse(req.body);
+
+        // TODO: implement
+        //int top_n = 1;
+        //if (body.count("top_n") != 1) {
+        //    top_n = body.at("top_n");
+        //} else {
+        //    res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+        //    return;
+        //}
+
+        json query;
+        if (body.count("query") == 1) {
+            query = body.at("query");
+            if (!query.is_string()) {
+                res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        } else {
+            res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+        if (documents.empty()) {
+            res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        // construct prompt object: array of ["query", "doc0", "doc1", ...]
+        json prompt;
+        prompt.push_back(query);
+        for (const auto & doc : documents) {
+            prompt.push_back(doc);
+        }
+
+        LOG_DBG("rerank prompt: %s\n", prompt.dump().c_str());
+
+        // create and queue the task
+        json responses = json::array();
+        bool error = false;
+        {
+            std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK);
+            ctx_server.queue_results.add_waiting_tasks(tasks);
+            ctx_server.queue_tasks.post(tasks);
+
+            // get the result
+            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+
+            ctx_server.receive_cmpl_results(task_ids, [&](std::vector<server_task_result> & results) {
+                for (const auto & res : results) {
+                    responses.push_back(res.data);
+                }
+            }, [&](const json & error_data) {
+                res_error(res, error_data);
+                error = true;
+            });
+        }
+
+        if (error) {
+            return;
+        }
+
+        // write JSON response
+        json root = format_response_rerank(body, responses);
+        res_ok(res, root);
+    };
+
     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
         json result = json::array();
         for (size_t i = 0; i < ctx_server.loras.size(); ++i) {
@@ -3119,6 +3298,10 @@ int main(int argc, char ** argv) {
     svr->Post("/embedding",           handle_embeddings); // legacy
     svr->Post("/embeddings",          handle_embeddings);
     svr->Post("/v1/embeddings",       handle_embeddings);
+    svr->Post("/rerank",              handle_rerank);
+    svr->Post("/reranking",           handle_rerank);
+    svr->Post("/v1/rerank",           handle_rerank);
+    svr->Post("/v1/reranking",        handle_rerank);
     svr->Post("/tokenize",            handle_tokenize);
     svr->Post("/detokenize",          handle_detokenize);
     // LoRA adapters hotswap
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index 818ea3beb..f4fe2ee43 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -15,7 +15,7 @@ Feature: llama.cpp server
     And   128 as batch size
     And   128 as ubatch size
     And   512 KV cache size
-    And   embeddings extraction
+    And   enable embeddings endpoint
     Then  the server is starting
     Then  the server is healthy
 
diff --git a/examples/server/tests/features/rerank.feature b/examples/server/tests/features/rerank.feature
new file mode 100644
index 000000000..c36cc8e21
--- /dev/null
+++ b/examples/server/tests/features/rerank.feature
@@ -0,0 +1,42 @@
+@llama.cpp
+@rerank
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
+    And   a model file jina-reranker-v1-tiny-en.gguf
+    And   a model alias jina-reranker-v1-tiny-en
+    And   42 as server seed
+    And   2 slots
+    And   512 as batch size
+    And   512 as ubatch size
+    And   512 KV cache size
+    And   enable reranking endpoint
+    Then  the server is starting
+    Then  the server is healthy
+
+  Scenario: Rerank
+    Given a rerank query:
+      """
+      Machine learning is
+      """
+    And   a rerank document:
+      """
+      A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
+      """
+    And   a rerank document:
+      """
+      Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
+      """
+    And   a rerank document:
+      """
+      Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
+      """
+    And   a rerank document:
+      """
+      Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
+      """
+    When  reranking request
+    Then  reranking results are returned
+    Then  reranking highest score is index 2 and lowest score is index 3
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 0fea0fe87..2611614ba 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -68,6 +68,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_api_key = None
     context.server_continuous_batching = False
     context.server_embeddings = False
+    context.server_reranking = False
     context.server_metrics = False
     context.server_process = None
     context.seed = None
@@ -83,6 +84,10 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.concurrent_tasks = []
     context.prompts = []
 
+    context.reranking_query = None
+    context.reranking_documents = []
+    context.reranking_results = None
+
 
 @step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file: str, hf_repo: str):
@@ -172,10 +177,13 @@ def step_server_continuous_batching(context):
     context.server_continuous_batching = True
 
 
-@step('embeddings extraction')
+@step('enable embeddings endpoint')
 def step_server_embeddings(context):
     context.server_embeddings = True
 
+@step('enable reranking endpoint')
+def step_server_reranking(context):
+    context.server_reranking = True
 
 @step('prometheus compatible metrics exposed')
 def step_server_metrics(context):
@@ -452,6 +460,14 @@ def step_impl(context, n_ga_w):
 def step_prompt_passkey(context):
     context.prompt_passkey = context_text(context)
 
+@step('a rerank query')
+def step_set_rerank_query(context):
+    context.reranking_query = context_text(context)
+    context.reranking_documents = []
+
+@step('a rerank document')
+def step_set_rerank_document(context):
+    context.reranking_documents.append(context_text(context))
 
 @step('{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
@@ -619,6 +635,22 @@ async def step_compute_embedding(context):
     context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
 
 
+@step('reranking request')
+@async_run_until_complete
+async def step_compute_reranking(context):
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
+        async with session.post(f'{context.base_url}/reranking',
+                                json={
+                                    "query": context.reranking_query,
+                                    "documents": context.reranking_documents,
+                                }) as response:
+            if response.status == 200:
+                response_json = await response.json()
+                context.reranking_results = response_json['results']
+            else:
+                context.reranking_results = response.status
+
+
 @step('all embeddings are the same')
 @async_run_until_complete
 async def step_all_embeddings_are_the_same(context):
@@ -704,6 +736,24 @@ async def all_embeddings_are_generated(context):
     for i in range(n_embedding_requests):
         assert_embeddings(context.tasks_result.pop().pop())
 
+@step('reranking results are returned')
+def reranking_results_are_returned(context):
+    assert len(context.reranking_results) == len(context.reranking_documents)
+
+@step('reranking highest score is index {idx_high:d} and lowest score is index {idx_low:d}')
+def reranking_results_are_returned(context, idx_high: int, idx_low: int):
+    max_score, max_idx = 0, 0
+    min_score, min_idx = 0, 0
+    for res in context.reranking_results:
+        if max_score < res['relevance_score']:
+            max_score = res['relevance_score']
+            max_idx   = res['index']
+        if min_score > res['relevance_score']:
+            min_score = res['relevance_score']
+            min_idx   = res['index']
+    print(context.reranking_results)
+    assert max_idx == idx_high
+    assert min_idx == idx_low
 
 @step('adding special tokens')
 def step_tokenize_set_add_special(context):
@@ -1362,6 +1412,8 @@ def start_server_background(context):
         server_args.append('--cont-batching')
     if context.server_embeddings:
         server_args.append('--embedding')
+    if context.server_reranking:
+        server_args.append('--reranking')
     if context.server_metrics:
         server_args.append('--metrics')
     if context.model_alias:
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index f093f547f..47dfdfde5 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -537,7 +537,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json {
+        {"usage", json { // TODO: fill
             {"prompt_tokens", 0},
             {"total_tokens", 0}
         }},
@@ -547,6 +547,29 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
+static json format_response_rerank(const json & request, const json & ranks) {
+    json data = json::array();
+    int i = 0;
+    for (const auto & rank : ranks) {
+        data.push_back(json{
+            {"index",    i++},
+            {"relevance_score", json_value(rank, "score", 0.0)},
+        });
+    }
+
+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json { // TODO: fill
+            {"prompt_tokens", 0},
+            {"total_tokens", 0}
+        }},
+        {"results", data}
+    };
+
+    return res;
+}
+
 static bool is_valid_utf8(const std::string & str) {
     const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
     const unsigned char* end = bytes + str.length();
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 2fd2e9d2b..ebe66a4a3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -345,6 +345,8 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN         = auto()
     ENC_FFN_UP           = auto()
     ENC_OUTPUT_NORM      = auto()
+    CLS                  = auto() # classifier
+    CLS_OUT              = auto() # classifier output projection
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -504,6 +506,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ENC_FFN_DOWN:              "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP:                "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM:           "enc.output_norm",
+    MODEL_TENSOR.CLS:                       "cls",
+    MODEL_TENSOR.CLS_OUT:                   "cls.output",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -613,6 +617,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_OUT,
     ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -644,6 +650,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
     ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 5ef91f11d..e7e9b6fd5 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -679,6 +679,15 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
+
+        MODEL_TENSOR.CLS: (
+            "classifier",       # jina
+            "classifier.dense", # roberta
+        ),
+
+        MODEL_TENSOR.CLS_OUT: (
+            "classifier.out_proj", # roberta
+        ),
     }
 
     # architecture-specific block mappings
diff --git a/include/llama.h b/include/llama.h
index 4ea8a2c2b..7cae1bbe2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -193,6 +193,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
         LLAMA_POOLING_TYPE_LAST = 3,
+        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
     };
 
     enum llama_attention_type {
@@ -202,9 +203,9 @@ extern "C" {
     };
 
     enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -872,7 +873,8 @@ extern "C" {
 
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // shape: [n_embd] (1-dimensional)
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
     //
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e4d844a73..d2f34ddd6 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1554,7 +1554,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
-                if (add_special && vocab.tokenizer_add_bos != 0) {
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
@@ -1572,14 +1572,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                     }
                 }
 
-                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }
 
-                if (add_special && vocab.tokenizer_add_eos == 1) {
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
@@ -1791,11 +1791,13 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
                 // suppressing them like CONTROL tokens.
                 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                     return _try_copy(token_text.data(), token_text.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                }
+                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                     std::string result = token_text;
                     llama_unescape_whitespace(result);
                     return _try_copy(result.data(), result.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                }
+                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                     char byte = (char) llama_token_to_byte(vocab, token);
                     return _try_copy((char*) &byte, 1);
                 }
@@ -1806,7 +1808,8 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
                 // suppressing them like CONTROL tokens.
                 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                     return _try_copy(token_text.data(), token_text.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                }
+                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
diff --git a/src/llama.cpp b/src/llama.cpp
index 44afb31d7..c466cd88b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -606,6 +606,8 @@ enum llm_tensor {
     LLM_TENSOR_ENC_FFN_DOWN,
     LLM_TENSOR_ENC_FFN_UP,
     LLM_TENSOR_ENC_OUTPUT_NORM,
+    LLM_TENSOR_CLS,
+    LLM_TENSOR_CLS_OUT,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -793,6 +795,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_CLS,             "cls" },
+            { LLM_TENSOR_CLS_OUT,         "cls.output" },
         },
     },
     {
@@ -828,6 +832,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_CLS,             "cls" },
         },
     },
     {
@@ -2894,6 +2899,7 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab   vocab;
 
+    // TODO: should init all tensors to nullptr
     struct ggml_tensor * tok_embd;
     struct ggml_tensor * type_embd;
     struct ggml_tensor * pos_embd;
@@ -2906,6 +2912,12 @@ struct llama_model {
     struct ggml_tensor * output_b;
     struct ggml_tensor * output_norm_enc;
 
+    // classifier
+    struct ggml_tensor * cls;
+    struct ggml_tensor * cls_b;
+    struct ggml_tensor * cls_out   = nullptr;
+    struct ggml_tensor * cls_out_b = nullptr;
+
     std::vector<llama_layer> layers;
 
     llama_split_mode split_mode;
@@ -5604,11 +5616,11 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
                 hparams.f_max_alibi_bias = 8.0f;
 
                 switch (hparams.n_layer) {
-                    case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
                     case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
                 }
             } break;
@@ -6313,6 +6325,7 @@ static void llm_load_vocab(
                     tokenizer_pre == "phi-2"   ||
                     tokenizer_pre == "jina-es" ||
                     tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "jina-v1-en" ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
                     tokenizer_pre == "jina-v2-code") {
@@ -6439,7 +6452,12 @@ static void llm_load_vocab(
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+
+        //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+        if (word.empty()) {
+            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+            word = "[EMPTY_" + std::to_string(i) + "]";
+        }
 
         vocab.token_to_id[word] = i;
         vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -6520,8 +6538,14 @@ static void llm_load_vocab(
         vocab.linefeed_id = ids[0];
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+
+        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        if (ids.empty()) {
+            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+            vocab.linefeed_id = vocab.special_pad_id;
+        } else {
+            vocab.linefeed_id = ids[0];
+        }
     }
 
     // special tokens
@@ -7394,6 +7418,12 @@ static bool llm_load_tensors(
 
                     if (model.arch == LLM_ARCH_BERT) {
                         model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train});
+
+                        model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        model.cls_out   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
 
                     model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
@@ -7446,6 +7476,8 @@ static bool llm_load_tensors(
                     model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
                     model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}); //LayerNorm bias
 
+                    model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -10279,6 +10311,10 @@ struct llm_build_context {
         struct ggml_tensor * cur;
 
         switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
             case LLAMA_POOLING_TYPE_MEAN:
                 {
                     struct ggml_tensor * inp_mean = build_inp_mean();
@@ -10290,9 +10326,26 @@ struct llm_build_context {
                     struct ggml_tensor * inp_cls = build_inp_cls();
                     cur = ggml_get_rows(ctx0, inp, inp_cls);
                 } break;
-            case LLAMA_POOLING_TYPE_NONE:
+            case LLAMA_POOLING_TYPE_RANK:
                 {
-                    cur = inp;
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    inp = ggml_get_rows(ctx0, inp, inp_cls);
+
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    GGML_ASSERT(model.cls       != nullptr);
+                    GGML_ASSERT(model.cls_b     != nullptr);
+
+                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (model.cls_out) {
+                        GGML_ASSERT(model.cls_out_b != nullptr);
+
+                        cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
+                    }
                 } break;
             default:
                 {
@@ -11521,8 +11574,8 @@ struct llm_build_context {
             inpL = cur;
         }
 
-        // final output
         cur = inpL;
+
         cb(cur, "result_embd", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -16682,7 +16735,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         }
     }
 
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.embeddings && (
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
         const int64_t n_tokens     = batch.n_tokens;
         const int64_t n_seq_tokens = batch.n_seq_tokens;
         const int64_t n_seqs       = batch.n_seqs;
@@ -16697,7 +16752,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             const llama_seq_id seq_id = batch.seq_id[s][0];
 
             // TODO: adapt limits to n_seqs when batch.equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
                 const llama_pos pos = batch.pos[s*n_seq_tokens + i];
@@ -17237,6 +17292,20 @@ static int llama_decode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // extract the rerank score - a single float per sequence
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(1);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        }
+                    } break;
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");
@@ -17443,6 +17512,13 @@ static int llama_encode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
+                        //       wait for an encoder model that requires this pooling type in order to test it
+                        //       https://github.com/ggerganov/llama.cpp/pull/9510
+                        GGML_ABORT("RANK pooling not implemented yet");
+                    }
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");

From 589b48d41efb0e95133b77c335f4fb9779af9bfb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 29 Sep 2024 14:38:18 +0300
Subject: [PATCH 56/78] contrib : add Resources section (#9675)

---
 CONTRIBUTING.md | 5 +++++
 README.md       | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a9e000e52..3d7c6f86c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,8 @@
 
 ![matmul](media/matmul.png)
 
+# Resources
+
+The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
+
+https://github.com/ggerganov/llama.cpp/projects
diff --git a/README.md b/README.md
index a452a6d78..ecc2df8ca 100644
--- a/README.md
+++ b/README.md
@@ -443,7 +443,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
-- Any help with managing issues and PRs is very appreciated!
+- Any help with managing issues, PRs and projects is very appreciated!
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)

From f99d3f8367174f7aba73c07fd87de687d4a0ece1 Mon Sep 17 00:00:00 2001
From: nopperl <54780682+nopperl@users.noreply.github.com>
Date: Sun, 29 Sep 2024 12:02:06 +0000
Subject: [PATCH 57/78] py : add model class for Chameleon conversion (#9683)

---
 convert_hf_to_gguf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 96a8830e9..f3857d487 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4162,7 +4162,8 @@ class GraniteMoeModel(GraniteModel):
         return super().modify_tensors(data_torch, name, bid)
 
 
-@Model.register("ChameleonForCausalLM")
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(Model):
     model_arch = gguf.MODEL_ARCH.CHAMELEON
 

From faac0bae265449fd988c57bf894018edc36fbe1e Mon Sep 17 00:00:00 2001
From: matiaslin <45382001+matiaslin@users.noreply.github.com>
Date: Sun, 29 Sep 2024 05:25:00 -0700
Subject: [PATCH 58/78] common : ensure llama_batch size does not exceed max
 size (#9668)

A crash was observed when the number of tokens added to a batch exceeds
llama_batch size. An assertion in llama_batch_add was added to protect
against llama_batch size overflow.
---
 common/common.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index e2b8574bf..a0611f3d1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1437,6 +1437,8 @@ void llama_batch_add(
                           llama_pos   pos,
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits) {
+    GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
+
     batch.token   [batch.n_tokens] = id;
     batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = seq_ids.size();

From 6084bfb261b03f812de2255b05b6b5bb8d1c7171 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 24 Sep 2024 13:23:59 +0300
Subject: [PATCH 59/78] ggml : fix GGML_MAX_N_THREADS + improve formatting
 (ggml/969)

---
 ggml/include/ggml.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9f96e0c48..f46d4a8a6 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -229,14 +229,16 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
-#define GGML_MAX_NAME           64
 #define GGML_MAX_N_THREADS      512
-
-#endif
 #define GGML_MAX_OP_PARAMS      64
+
+#ifndef GGML_MAX_NAME
+#   define GGML_MAX_NAME        64
+#endif
+
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
+
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
 #else
@@ -259,21 +261,21 @@
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
+#   define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
-#define GGML_UNREACHABLE() __builtin_unreachable()
+#   define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
+#   define GGML_UNREACHABLE() __assume(0)
 #else
-#define GGML_UNREACHABLE() ((void) 0)
+#   define GGML_UNREACHABLE() ((void) 0)
 #endif
 
 #ifdef __cplusplus
-#define GGML_NORETURN [[noreturn]]
+#   define GGML_NORETURN [[noreturn]]
 #elif defined(_MSC_VER)
-#define GGML_NORETURN __declspec(noreturn)
+#   define GGML_NORETURN __declspec(noreturn)
 #else
-#define GGML_NORETURN _Noreturn
+#   define GGML_NORETURN _Noreturn
 #endif
 
 #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)

From 544f409b4bd8fc98a3e87820f0ac934e00402de7 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Thu, 26 Sep 2024 08:59:42 +0200
Subject: [PATCH 60/78] vulkan : argsort barriers must be under uniform control
 flow (ggml/951)

a return before a barrier (that happens only in some threads in
a workgroup) leads to UB.
While the old code actually works on some devices,
it fails on some others (i.e. "smaller" GPUs).

BTW, I think it would be better to set specialization constants
when the graph is built, in that way the local workgroup
could be sized appropriately.
But it would take a lot of work.

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
---
 ggml/src/vulkan-shaders/argsort.comp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/ggml/src/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp
index e55414b03..d4fa45b1e 100644
--- a/ggml/src/vulkan-shaders/argsort.comp
+++ b/ggml/src/vulkan-shaders/argsort.comp
@@ -29,20 +29,18 @@ void main() {
     const int col = int(gl_LocalInvocationID.x);
     const uint row = gl_WorkGroupID.y;
 
-    if (col >= p.ncols_pad) {
-        return;
-    }
-
     const uint row_offset = row * p.ncols;
 
     // initialize indices
-    dst_row[col] = col;
+    if (col < p.ncols_pad) {
+        dst_row[col] = col;
+    }
     barrier();
 
     for (uint k = 2; k <= p.ncols_pad; k *= 2) {
         for (uint j = k / 2; j > 0; j /= 2) {
             const uint ixj = col ^ j;
-            if (ixj > col) {
+            if (col < p.ncols_pad && ixj > col) {
                 if ((col & k) == 0) {
                     if (dst_row[col] >= p.ncols ||
                         (dst_row[ixj] < p.ncols && (p.order == ASC ?

From 0de8b203f1d31cf5ee0d2a3560a0ad78d44f4d4c Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Fri, 27 Sep 2024 02:58:01 -0500
Subject: [PATCH 61/78] vulkan : fix build for GGML_VULKAN_RUN_TESTS, add
 TFLOPS to log (ggml/961)

---
 ggml/src/ggml-vulkan.cpp | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index a877145e8..70b729154 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -5013,6 +5013,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }
 
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
     vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
     vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
     vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -5129,7 +5131,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
     avg_err /= m * n;
 
-    std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl;
+    double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+    std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
 
     if (avg_err > 0.1) {
         std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -5251,12 +5255,14 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 
     ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
 
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
     ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
 
     vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx->device, subctx);
     const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
-    ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
+    ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);
 
     auto begin = std::chrono::high_resolution_clock::now();
@@ -5383,6 +5389,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
         }
     }
 
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
     ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
     ggml_vk_buffer_write(y_buf, 0, y, y_sz);
 
@@ -5450,7 +5458,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 
     avg_err /= m * n;
 
-    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
+    double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
 
     if (avg_err > 0.01 || std::isnan(avg_err)) {
         std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -5502,9 +5512,6 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);

From 641002fba8d2a0c0269027e23d2ef58e90546028 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Sun, 29 Sep 2024 11:50:17 -0500
Subject: [PATCH 62/78] vulkan : multithread pipeline creation (ggml/963)

---
 ggml/src/ggml-vulkan.cpp | 41 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 70b729154..c677a2728 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -20,6 +20,8 @@
 #include <unordered_map>
 #include <memory>
 #include <mutex>
+#include <future>
+#include <thread>
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -607,13 +609,16 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
-static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
     VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    std::lock_guard<std::mutex> guard(device->mutex);
-
     pipeline = std::make_shared<vk_pipeline_struct>();
     pipeline->name = name;
     pipeline->parameter_count = parameter_count;
@@ -681,7 +686,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
         pipeline->layout);
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
-    device->pipelines.insert({ pipeline->name, pipeline });
+    {
+        std::lock_guard<std::mutex> guard(device->mutex);
+        device->pipelines.insert({ pipeline->name, pipeline });
+    }
+
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
 }
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
@@ -1194,6 +1209,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
+    std::vector<std::future<void>> compiles;
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+        {
+            // wait until fewer than N compiles are in progress
+            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
+            std::unique_lock<std::mutex> guard(compile_count_mutex);
+            while (compile_count >= N) {
+                compile_count_cond.wait(guard);
+            }
+            compile_count++;
+        }
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
+    };
+
     if (device->fp16) {
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1743,6 +1772,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
+
+    for (auto &c : compiles) {
+        c.wait();
+    }
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {

From aaa40999251f5d18309b3fddc5a7d576f5fdb4e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 29 Sep 2024 19:56:17 +0200
Subject: [PATCH 63/78] CUDA: remove bad assert (ggml/972)

---
 ggml/src/ggml-cuda/im2col.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
index 3d0d8d4e6..16463ab0f 100644
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -69,7 +69,6 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 

From d0b1d663e430354ab35853a6e1bce51cc8819376 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 29 Sep 2024 21:16:07 +0300
Subject: [PATCH 64/78] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 36eeed0cc..aa301462a 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-336c10a4c3c8ec99af484b25a0cddd397a09cdb2
+9a24b8c8c40eab7262d067e91d08df160678df8d

From c919d5db39c8a7fcb64737f008e4b105ee0acd20 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 29 Sep 2024 21:18:23 +0300
Subject: [PATCH 65/78] ggml : define missing HWCAP flags (#9684)

ggml-ci

Co-authored-by: Willy Tarreau <w@1wt.eu>
---
 ggml/src/ggml.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index fac4466e3..81b651c6a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3687,6 +3687,10 @@ static inline int ggml_up(int n, int m) {
 #include <sys/sysctl.h>
 #endif
 
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM 0
+#endif
+
 static void ggml_init_arm_arch_features(void) {
 #if defined(__linux__) && defined(__aarch64__)
     uint32_t hwcap = getauxval(AT_HWCAP);

From 8277a817f18967581b02b2248989d773e8e99998 Mon Sep 17 00:00:00 2001
From: Ruchira Hasaranga <ruchira66@gmail.com>
Date: Mon, 30 Sep 2024 13:53:42 +0530
Subject: [PATCH 66/78] console : utf-8 fix for windows stdin (#9690)

* utf-8 fix for windows stdin

* Update common/console.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/console.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/common/console.cpp b/common/console.cpp
index f65cbc6ed..078a8d678 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -94,6 +94,9 @@ namespace console {
                 simple_io = true;
             }
         }
+        if (simple_io) {
+            _setmode(_fileno(stdin), _O_U8TEXT);
+        }
 #else
         // POSIX-specific console initialization
         if (!simple_io) {

From ace4f4be37abed4801fbd54a94cf38a7ae462416 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 30 Sep 2024 17:48:49 +0300
Subject: [PATCH 67/78] flake.lock: Update (#9680)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/c04d5652cfa9742b1d519688f65d1bbccea9eb7e?narHash=sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK%2BWk%3D' (2024-09-19)
  → 'github:NixOS/nixpkgs/1925c603f17fc89f4c8f6bf6f631a802ad85d784?narHash=sha256-J%2BPeFKSDV%2BpHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI%3D' (2024-09-26)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 6333a09f0..dde1ab527 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1726755586,
-        "narHash": "sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK+Wk=",
+        "lastModified": 1727348695,
+        "narHash": "sha256-J+PeFKSDV+pHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "c04d5652cfa9742b1d519688f65d1bbccea9eb7e",
+        "rev": "1925c603f17fc89f4c8f6bf6f631a802ad85d784",
         "type": "github"
       },
       "original": {

From 08a43d05b6ba74de97610ae519450ad9996475e0 Mon Sep 17 00:00:00 2001
From: vb <vaibhavs10@gmail.com>
Date: Mon, 30 Sep 2024 17:03:47 +0200
Subject: [PATCH 68/78] py : update transfomers version (#9694)

* update transfomers version.

* update hfh version.
---
 examples/server/tests/requirements.txt             | 2 +-
 requirements/requirements-convert_legacy_llama.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index f2d7e5c57..553954872 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 behave~=1.2.6
-huggingface_hub~=0.20.3
+huggingface_hub~=0.23.2
 numpy~=1.26.4
 openai~=1.30.3
 prometheus-client~=0.20.0
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
index 1d07b0952..859204b27 100644
--- a/requirements/requirements-convert_legacy_llama.txt
+++ b/requirements/requirements-convert_legacy_llama.txt
@@ -1,5 +1,5 @@
 numpy~=1.26.4
 sentencepiece~=0.2.0
-transformers>=4.40.1,<5.0.0
+transformers>=4.45.1,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0

From 511636df0c90826b4dd1fc21ff260c19d69a3b5d Mon Sep 17 00:00:00 2001
From: compilade <git@compilade.net>
Date: Mon, 30 Sep 2024 14:13:16 -0400
Subject: [PATCH 69/78] ci : reduce severity of unused Pyright ignore comments
 (#9697)

---
 .github/workflows/python-type-check.yml         | 4 +++-
 examples/llava/convert_image_encoder_to_gguf.py | 4 ++--
 pyrightconfig.json                              | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
index e5ff5e6d7..373bb6010 100644
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,11 +4,13 @@ on:
   push:
     paths:
       - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'
 
@@ -33,6 +35,6 @@ jobs:
       - name: Type-check with Pyright
         uses: jakebailey/pyright-action@v2
         with:
-          version: 1.1.370
+          version: 1.1.382
           level: warning
           warnings: true
diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py
index 36f6b92fb..4fa1d6cea 100644
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
 
 
 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
+    model.vision_model.encoder.layers.pop(-1)
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
@@ -288,7 +288,7 @@ if has_llava_projector:
 
     print("Projector tensors added\n")
 
-state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
+state_dict = model.state_dict()
 for name, data in state_dict.items():
     if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
         # we don't need this
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 6016f4b6d..9acbbeb78 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -5,7 +5,8 @@
   "reportUnusedImport": "warning",
   "reportDuplicateImport": "error",
   "reportDeprecated": "warning",
-  "reportUnnecessaryTypeIgnoreComment": "warning",
+  "reportUnnecessaryTypeIgnoreComment": "information",
+  "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum
   "executionEnvironments": [
     {
       // TODO: make this version override work correctly

From 6f1d9d71f4c568778a7637ff6582e6f6ba5fb9d3 Mon Sep 17 00:00:00 2001
From: serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
Date: Mon, 30 Sep 2024 21:57:12 +0300
Subject: [PATCH 70/78] Fix Docker ROCM builds, use AMDGPU_TARGETS instead of
 GPU_TARGETS (#9641)

* Fix Docker ROCM builds, use AMDGPU_TARGETS instead of GPU_TARGETS

* Set ROCM_DOCKER_ARCH as string due it incorrectly build and cause OOM exit code
---
 .devops/full-rocm.Dockerfile         | 6 +++---
 .devops/llama-cli-rocm.Dockerfile    | 6 +++---
 .devops/llama-server-rocm.Dockerfile | 6 +++---
 .github/workflows/build.yml          | 2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
index 680d1cb92..df496bcd2 100644
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"
 
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .
 
 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile
index c3d1ab067..e60c747bd 100644
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"
 
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .
 
 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile
index fd0e19ad6..8553af75b 100644
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"
 
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .
 
 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e6a977b60..c71d422e7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1032,7 +1032,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
           md "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"

From 1927378bcce20ba72b6c89d5977b854a4bcaeb5d Mon Sep 17 00:00:00 2001
From: compilade <git@compilade.net>
Date: Tue, 1 Oct 2024 02:31:36 -0400
Subject: [PATCH 71/78] convert : refactor rope_freqs generation (#9396)

* convert : refactor rope_freqs generation

This should also fix vocab-only conversion for Phi-3.

* convert : adapt MiniCPM3 to separate rope_freqs insertion

MiniCPM3's tokenizer is treated as a SentencePiece tokenizer to avoid
having to run its custom Python code which mixes tokenization
in the same file as tool calls.

gguf-py : add long and short RoPE factors to tensor mappings

Empty, but the key names are used to populate the mappings.
---
 convert_hf_to_gguf.py          | 61 ++++++++++++++++++----------------
 convert_lora_to_gguf.py        |  5 ++-
 gguf-py/gguf/constants.py      |  4 +++
 gguf-py/gguf/tensor_mapping.py |  3 ++
 4 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f3857d487..da5feb25b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -15,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -64,7 +65,6 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
-    is_lora: bool
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
@@ -72,7 +72,7 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -94,7 +94,6 @@ class Model:
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
-        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -270,10 +269,14 @@ class Model:
 
         return False
 
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
@@ -1617,7 +1620,7 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
@@ -1644,9 +1647,9 @@ class LlamaModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                if not self.is_lora:
-                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
@@ -1870,8 +1873,6 @@ class MiniCPM3Model(Model):
     def set_gguf_parameters(self):
         hparams = self.hparams
 
-        rope_dims = hparams["qk_rope_head_dim"]
-
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
@@ -1887,24 +1888,25 @@ class MiniCPM3Model(Model):
         self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is None:
-            return
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
 
-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-        if long_factors is None or short_factors is None:
-            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
-        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        self._set_vocab_sentencepiece()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -2216,6 +2218,13 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
         if rope_scaling is None:
@@ -2245,9 +2254,8 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        if not self.is_lora:
-            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
 @Model.register("PlamoForCausalLM")
@@ -4071,7 +4079,7 @@ class ExaoneModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
@@ -4098,10 +4106,7 @@ class ExaoneModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                if not self.is_lora:
-                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
-
-        super().prepare_tensors()
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
 
 @Model.register("GraniteForCausalLM")
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index d1c94e580..439a78de1 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -331,6 +331,10 @@ if __name__ == '__main__':
                 self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()
 
+            def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+                # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
+                return ()
+
             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 tensor_map: dict[str, PartialLoraTensor] = {}
 
@@ -392,7 +396,6 @@ if __name__ == '__main__':
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
-            is_lora=True,
         )
 
         logger.info("Exporting model...")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ebe66a4a3..e08617ba2 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -814,6 +814,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_QKV,
         MODEL_TENSOR.ATTN_Q,
@@ -892,6 +894,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q_A,
         MODEL_TENSOR.ATTN_Q_B,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index e7e9b6fd5..f4a787c56 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -87,6 +87,9 @@ class TensorNameMap:
             "rope.freqs",  # llama-pth
             "rotary_pos_emb.inv_freq",  # chatglm
         ),
+
+        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {

From a90484c6d9db699bf739d0f33daf1c50cbdd45c9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 1 Oct 2024 11:42:01 +0300
Subject: [PATCH 72/78] llama : print correct model type for Llama 3.2 1B and
 3B

---
 src/llama.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index c466cd88b..d1d27d21e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5502,8 +5502,10 @@ static void llm_load_hparams(
                     }
                 } else {
                     switch (hparams.n_layer) {
+                        case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
                         case 22: model.type = e_model::MODEL_1B; break;
                         case 26: model.type = e_model::MODEL_3B; break;
+                        case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
                         // granite uses a vocab with len 49152
                         case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                         case 36: model.type = e_model::MODEL_8B; break; // granite

From cad341d88924788b940997c55e7bef000c99e784 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 1 Oct 2024 16:00:25 +0300
Subject: [PATCH 73/78] metal : reduce command encoding overhead (#9698)

* metal : reduce command encoding overhead

ggml-ci

* metal : add comments
---
 examples/cvector-generator/pca.hpp |    7 -
 examples/llava/clip.cpp            |    6 -
 ggml/include/ggml-metal.h          |    5 -
 ggml/src/ggml-metal.m              | 3888 ++++++++++++++--------------
 src/llama.cpp                      |    6 -
 5 files changed, 2000 insertions(+), 1912 deletions(-)

diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index a969c486d..f6e307fbc 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -204,13 +204,6 @@ static ggml_status compute_piter(
         ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }
 
-// TODO: enable GPU support when support for GGML_OP_SQRT is added
-//#ifdef GGML_USE_METAL
-//    if (ggml_backend_is_metal(model.backend)) {
-//        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-//    }
-//#endif
-
     ggml_status res = ggml_backend_graph_compute(model.backend, gf);
     if (res == GGML_STATUS_SUCCESS) {
         auto extract_i = [](std::string prefix, std::string str) -> int {
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 8aa7b0750..14e02c8dd 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2444,12 +2444,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index d483cf1ac..4d4165324 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -25,9 +25,6 @@
 #include <stddef.h>
 #include <stdbool.h>
 
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
 struct ggml_tensor;
 struct ggml_cgraph;
 
@@ -48,8 +45,6 @@ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
 GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index ef3b7f0e8..9da08fe2e 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -12,6 +12,12 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 64
+
+// max number of MTLCommandBuffer used to submit a graph for processing
+#define GGML_METAL_MAX_COMMAND_BUFFERS 8
+
 #ifdef GGML_METAL_NDEBUG
 #define GGML_METAL_LOG(...)
 #define GGML_METAL_LOG_INFO(...)
@@ -221,11 +227,11 @@ enum ggml_metal_kernel_type {
 };
 
 struct ggml_backend_metal_context {
-    int n_cb;
-
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
 
+    MTLComputePassDescriptor * edesc;
+
     dispatch_queue_t d_queue;
 
     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -233,7 +239,27 @@ struct ggml_backend_metal_context {
     bool support_simdgroup_reduction;
     bool support_simdgroup_mm;
 
-    bool should_capture_next_compute;
+    // capture state
+    bool capture_next_compute;
+    bool capture_started;
+
+    id<MTLCaptureScope> capture_scope;
+
+    // command buffer state
+    int n_cb;           // number of extra threads used to submit the command buffers
+    int n_nodes_0;      // number of nodes submitted by the main thread
+    int n_nodes_1;      // remaining number of nodes submitted by the n_cb threads
+    int n_nodes_per_cb;
+
+    struct ggml_cgraph * gf;
+
+    // the callback given to the thread pool
+    // TODO: ideally, this should be created once, utilizing the command buffer state above
+    //       for some reason, doing it like this leads to a crash
+    void (^encode_async)(size_t ith);
+
+    // n_cb command buffers + 1 used by the main thread
+    id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
 
     // abort ggml_metal_graph_compute if callback returns true
     ggml_abort_callback abort_callback;
@@ -303,7 +329,7 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_backend_metal_context * ggml_metal_init(void) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -322,8 +348,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
     // Configure context
     struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
     ctx->device = device;
-    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
+    ctx->edesc  = MTLComputePassDescriptor.computePassDescriptor;
+    ctx->edesc.dispatchType = MTLDispatchTypeSerial;
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -455,7 +482,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 
-    ctx->should_capture_next_compute = false;
+    ctx->capture_next_compute = false;
+    ctx->capture_started = false;
+    ctx->capture_scope = nil;
+
+    ctx->gf = nil;
+    ctx->encode_async = nil;
+    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
+        ctx->command_buffers[i] = nil;
+    }
 
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
@@ -686,6 +721,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
     }
 
     [metal_library release];
+
     return ctx;
 }
 
@@ -874,874 +910,820 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
     }
 }
 
-static enum ggml_status ggml_metal_graph_compute(
-        struct ggml_backend_metal_context * ctx,
-               struct ggml_cgraph * gf) {
+static void ggml_metal_encode_node(
+     struct ggml_backend_metal_context * ctx,
+                                   int   idx,
+          id<MTLComputeCommandEncoder>   encoder) {
+    struct ggml_cgraph * gf = ctx->gf;
 
-    @autoreleasepool {
-    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
-    edesc.dispatchType = MTLDispatchTypeSerial;
+    struct ggml_tensor * node = ggml_graph_node(gf, idx);
 
-    // create multiple command buffers and enqueue them
-    // then, we encode the graph into the command buffers in parallel
+    //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
 
-    const int n_nodes = gf->n_nodes;
-    const int n_cb = ctx->n_cb;
-    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+    struct ggml_tensor * src0 = node->src[0];
+    struct ggml_tensor * src1 = node->src[1];
+    struct ggml_tensor * src2 = node->src[2];
+    struct ggml_tensor * dst  = node;
 
-    const bool should_capture = ctx->should_capture_next_compute;
-    if (should_capture) {
-        ctx->should_capture_next_compute = false;
-
-        MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
-        descriptor.captureObject = ctx->queue;
-
-        NSError * error = nil;
-        if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
-            GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-            GGML_ABORT("capture failed");
-        }
+    if (ggml_is_empty(dst)) {
+        return;
     }
 
-    id<MTLCommandBuffer> command_buffer_builder[n_cb];
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
-        command_buffer_builder[cb_idx] = command_buffer;
-
-        // always enqueue the first two command buffers
-        // enqueue all of the command buffers if we don't need to abort
-        if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer enqueue];
-        }
+    switch (dst->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
+            {
+                // noop -> next node
+            } return;
+        default:
+            {
+            } break;
     }
 
-    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
+    if (!ggml_metal_supports_op(ctx, dst)) {
+        GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+        GGML_ABORT("unsupported op");
+    }
 
-    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
-        const int cb_idx = iter;
+    const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+    const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+    const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+    const int64_t  ne03 = src0 ? src0->ne[3] : 0;
 
-        size_t offs_src0 = 0;
-        size_t offs_src1 = 0;
-        size_t offs_src2 = 0;
-        size_t offs_dst  = 0;
+    const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+    const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+    const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+    const uint64_t nb03 = src0 ? src0->nb[3] : 0;
 
-        id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
+    const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+    const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+    const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+    const int64_t  ne13 = src1 ? src1->ne[3] : 0;
 
-        const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-        const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
+    const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+    const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+    const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+    const uint64_t nb13 = src1 ? src1->nb[3] : 0;
 
-        for (int i = node_start; i < node_end; ++i) {
-            if (i == -1) {
-                [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
-                continue;
-            }
+    const int64_t  ne20 = src2 ? src2->ne[0] : 0;
+    const int64_t  ne21 = src2 ? src2->ne[1] : 0;
+    const int64_t  ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
+    const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
 
-            //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+    const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+    const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+    const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+    const uint64_t nb23 = src2 ? src2->nb[3] : 0;
 
-            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
-            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-            struct ggml_tensor * dst  = gf->nodes[i];
+    const int64_t  ne0  =  dst ?  dst->ne[0] : 0;
+    const int64_t  ne1  =  dst ?  dst->ne[1] : 0;
+    const int64_t  ne2  =  dst ?  dst->ne[2] : 0;
+    const int64_t  ne3  =  dst ?  dst->ne[3] : 0;
 
-            if (ggml_is_empty(dst)) {
-                continue;
-            }
+    const uint64_t nb0  =  dst ?  dst->nb[0] : 0;
+    const uint64_t nb1  =  dst ?  dst->nb[1] : 0;
+    const uint64_t nb2  =  dst ?  dst->nb[2] : 0;
+    const uint64_t nb3  =  dst ?  dst->nb[3] : 0;
 
-            switch (dst->op) {
-                case GGML_OP_NONE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_VIEW:
-                case GGML_OP_TRANSPOSE:
-                case GGML_OP_PERMUTE:
-                    {
-                        // noop -> next node
-                    } continue;
+    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+    const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+
+    size_t offs_src0 = 0;
+    size_t offs_src1 = 0;
+    size_t offs_src2 = 0;
+    size_t offs_dst  = 0;
+
+    id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
+    id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+    id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
+    id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
+
+    //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+    //if (src0) {
+    //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+    //            ggml_is_contiguous(src0), src0->name);
+    //}
+    //if (src1) {
+    //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+    //            ggml_is_contiguous(src1), src1->name);
+    //}
+    //if (dst) {
+    //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+    //            dst->name);
+    //}
+
+    switch (dst->op) {
+        case GGML_OP_CONCAT:
+            {
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
+
+                const int32_t dim = ((const int32_t *) dst->op_params)[0];
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                [encoder setBytes:&dim  length:sizeof(dim)  atIndex:27];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                const size_t offs = 0;
+
+                bool bcast_row = false;
+
+                int64_t nb = ne00; // used by the "row" kernels
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    // src1 is a row
+                    GGML_ASSERT(ne11 == 1);
+
+                    nb = ne00 / 4;
+                    switch (dst->op) {
+                        case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
+                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
+                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    }
+
+                    bcast_row = true;
+                } else {
+                    switch (dst->op) {
+                        case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
+                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    }
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
+
+                if (bcast_row) {
+                    const int64_t n = ggml_nelements(dst)/4;
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } else {
+                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                }
+            } break;
+        case GGML_OP_REPEAT:
+            {
+                id<MTLComputePipelineState> pipeline;
+
+                switch (src0t) {
+                    case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
+                    case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
+                    case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
+                    case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ACC:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                GGML_ASSERT(dstt  == GGML_TYPE_F32);
+
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+
+                const size_t pnb1 = ((const int32_t *) dst->op_params)[0];
+                const size_t pnb2 = ((const int32_t *) dst->op_params)[1];
+                const size_t pnb3 = ((const int32_t *) dst->op_params)[2];
+                const size_t offs = ((const int32_t *) dst->op_params)[3];
+
+                const bool inplace = (bool) ((const int32_t *) dst->op_params)[4];
+
+                if (!inplace) {
+                    // run a separete kernel to cpy src->dst
+                    // not sure how to avoid this
+                    // TODO: make a simpler cpy_bytes kernel
+
+                    const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                    [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                    [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                    [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                    [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                    [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                    [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                    [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                    [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                    [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                    [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                    [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                    [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                    [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                    [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                    [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                    [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+
+                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                }
+
+                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
+                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
+                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
+                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
+                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
+                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
+                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_SCALE:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                float scale;
+                memcpy(&scale, dst->op_params, sizeof(scale));
+
+                int64_t n = ggml_nelements(dst);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (n % 4 == 0) {
+                    n /= 4;
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
+                } else {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
+
+                float min;
+                float max;
+                memcpy(&min, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&max, ((const int32_t *) dst->op_params) + 1, sizeof(float));
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                [encoder setBytes:&min length:sizeof(min) atIndex:2];
+                [encoder setBytes:&max length:sizeof(max) atIndex:3];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(node)) {
+                // we are not taking into account the strides, so for now require contiguous tensors
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                case GGML_UNARY_OP_TANH:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_RELU:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_SIGMOID:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_GELU:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_SILU:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
                 default:
-                    {
-                    } break;
-            }
-
-            if (!ggml_metal_supports_op(ctx, dst)) {
-                GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                GGML_ABORT("unsupported op");
-            }
-
-            if (should_capture) {
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-            }
-
-            const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-            const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-            const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-            const int64_t  ne03 = src0 ? src0->ne[3] : 0;
-
-            const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-            const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-            const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-            const uint64_t nb03 = src0 ? src0->nb[3] : 0;
-
-            const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-            const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-            const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-            const int64_t  ne13 = src1 ? src1->ne[3] : 0;
-
-            const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-            const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-            const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-            const uint64_t nb13 = src1 ? src1->nb[3] : 0;
-
-            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-            const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-            const int64_t  ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
-            const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
-
-            const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-            const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-            const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-            const uint64_t nb23 = src2 ? src2->nb[3] : 0;
-
-            const int64_t  ne0  =  dst ?  dst->ne[0] : 0;
-            const int64_t  ne1  =  dst ?  dst->ne[1] : 0;
-            const int64_t  ne2  =  dst ?  dst->ne[2] : 0;
-            const int64_t  ne3  =  dst ?  dst->ne[3] : 0;
-
-            const uint64_t nb0  =  dst ?  dst->nb[0] : 0;
-            const uint64_t nb1  =  dst ?  dst->nb[1] : 0;
-            const uint64_t nb2  =  dst ?  dst->nb[2] : 0;
-            const uint64_t nb3  =  dst ?  dst->nb[3] : 0;
-
-            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
-
-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
-            id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
-            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
-
-            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-            //if (src0) {
-            //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-            //            ggml_is_contiguous(src0), src0->name);
-            //}
-            //if (src1) {
-            //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-            //            ggml_is_contiguous(src1), src1->name);
-            //}
-            //if (dst) {
-            //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-            //            dst->name);
-            //}
-
-            switch (dst->op) {
-                case GGML_OP_CONCAT:
-                    {
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
-
-                        const int32_t dim = ((int32_t *) dst->op_params)[0];
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                        [encoder setBytes:&dim  length:sizeof(dim)  atIndex:27];
-
-                        const int nth = MIN(1024, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ADD:
-                case GGML_OP_SUB:
-                case GGML_OP_MUL:
-                case GGML_OP_DIV:
-                    {
-                        GGML_ASSERT(src0t == GGML_TYPE_F32);
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                        const size_t offs = 0;
-
-                        bool bcast_row = false;
-
-                        int64_t nb = ne00; // used by the "row" kernels
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            // src1 is a row
-                            GGML_ASSERT(ne11 == 1);
-
-                            nb = ne00 / 4;
-                            switch (dst->op) {
-                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
-                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                default: GGML_ABORT("fatal error");
-                            }
-
-                            bcast_row = true;
-                        } else {
-                            switch (dst->op) {
-                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
-                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                default: GGML_ABORT("fatal error");
-                            }
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
-
-                        if (bcast_row) {
-                            const int64_t n = ggml_nelements(dst)/4;
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } else {
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        }
-                    } break;
-                case GGML_OP_REPEAT:
-                    {
-                        id<MTLComputePipelineState> pipeline;
-
-                        switch (src0t) {
-                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
-                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
-                            case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
-                            case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
-                            default: GGML_ABORT("fatal error");
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-
-                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ACC:
-                    {
-                        GGML_ASSERT(src0t == GGML_TYPE_F32);
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                        GGML_ASSERT(dstt  == GGML_TYPE_F32);
-
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-                        GGML_ASSERT(ggml_is_contiguous(src1));
-
-                        const size_t pnb1 = ((int32_t *) dst->op_params)[0];
-                        const size_t pnb2 = ((int32_t *) dst->op_params)[1];
-                        const size_t pnb3 = ((int32_t *) dst->op_params)[2];
-                        const size_t offs = ((int32_t *) dst->op_params)[3];
-
-                        const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                        if (!inplace) {
-                            // run a separete kernel to cpy src->dst
-                            // not sure how to avoid this
-                            // TODO: make a simpler cpy_bytes kernel
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        }
-
-                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-
-                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_SCALE:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        float scale;
-                        memcpy(&scale, dst->op_params, sizeof(scale));
-
-                        int64_t n = ggml_nelements(dst);
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (n % 4 == 0) {
-                            n /= 4;
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
-                        } else {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_CLAMP:
-                    {
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
-
-                        float min;
-                        float max;
-                        memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
-                        memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&min length:sizeof(min) atIndex:2];
-                        [encoder setBytes:&max length:sizeof(max) atIndex:3];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_UNARY:
-                    switch (ggml_get_unary_op(gf->nodes[i])) {
-                        // we are not taking into account the strides, so for now require contiguous tensors
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        case GGML_UNARY_OP_TANH:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_RELU:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_SIGMOID:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_GELU:
-                            {
-                                int64_t n = ggml_nelements(dst);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                if (n % 4 == 0) {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
-                                    n /= 4;
-                                } else {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_GELU_QUICK:
-                            {
-                                int64_t n = ggml_nelements(dst);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                if (n % 4 == 0) {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
-                                    n /= 4;
-                                } else {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_SILU:
-                            {
-                                int64_t n = ggml_nelements(dst);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                if (n % 4 == 0) {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
-                                    n /= 4;
-                                } else {
-                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        default:
-                            {
-                                GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                GGML_ABORT("fatal error");
-                            }
-                    } break;
-                case GGML_OP_SQR:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SQRT:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SIN:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_COS:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SUM_ROWS:
-                    {
-                        GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SOFT_MAX:
-                    {
-                        GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
-
-                        int nth = 32; // SIMD width
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
-
-                        if (ne00%4 == 0) {
-                            while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
-                                nth *= 2;
-                            }
-                            if (use_f16) {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
-                            }
-                        } else {
-                            while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
-                                nth *= 2;
-                            }
-                            if (use_f16) {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline;
-                            }
-                        }
-
-                        float scale;
-                        float max_bias;
-
-                        memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
-                        memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
-
-                        const int64_t nrows_x = ggml_nrows(src0);
-                        const int64_t nrows_y = src0->ne[1];
-
-                        const uint32_t n_head      = nrows_x/nrows_y;
-                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
-
-                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                        if (id_src1) {
-                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
-                        } else {
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
-                        }
-                        [encoder setBuffer:id_dst      offset:offs_dst            atIndex:2];
-                        [encoder setBytes:&ne00        length:sizeof(ne00)        atIndex:3];
-                        [encoder setBytes:&ne01        length:sizeof(ne01)        atIndex:4];
-                        [encoder setBytes:&ne02        length:sizeof(ne02)        atIndex:5];
-                        [encoder setBytes:&scale       length:sizeof(scale)       atIndex:6];
-                        [encoder setBytes:&max_bias    length:sizeof(max_bias)    atIndex:7];
-                        [encoder setBytes:&m0          length:sizeof(m0)          atIndex:8];
-                        [encoder setBytes:&m1          length:sizeof(m1)          atIndex:9];
-                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_DIAG_MASK_INF:
-                    {
-                        const int n_past = ((int32_t *)(dst->op_params))[0];
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (ne00%8 == 0) {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
-                        } else {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
-
-                        if (ne00%8 == 0) {
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        }
-                        else {
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        }
-                    } break;
-                case GGML_OP_SSM_CONV:
-                    {
-                        GGML_ASSERT(src0t == GGML_TYPE_F32);
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-                        GGML_ASSERT(ggml_is_contiguous(src1));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                        [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01    length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&nb00    length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&ne10    length:sizeof(ne10) atIndex:9];
-                        [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:10];
-                        [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
-                        [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
-                        [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                        [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                        [encoder setBytes:&ne2     length:sizeof(ne2)  atIndex:15];
-                        [encoder setBytes:&nb0     length:sizeof(nb0)  atIndex:16];
-                        [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:17];
-                        [encoder setBytes:&nb2     length:sizeof(nb2)  atIndex:18];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SSM_SCAN:
-                    {
-                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];
-                        struct ggml_tensor * src4 = gf->nodes[i]->src[4];
-                        struct ggml_tensor * src5 = gf->nodes[i]->src[5];
-
-                        GGML_ASSERT(src3);
-                        GGML_ASSERT(src4);
-                        GGML_ASSERT(src5);
-
-                        size_t offs_src3 = 0;
-                        size_t offs_src4 = 0;
-                        size_t offs_src5 = 0;
-
-                        id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
-                        id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
-                        id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
-
-                        const int64_t  ne30 = src3->ne[0]; GGML_UNUSED(ne30);
-                        const int64_t  ne31 = src3->ne[1]; GGML_UNUSED(ne31);
-
-                        const uint64_t nb30 = src3->nb[0];
-                        const uint64_t nb31 = src3->nb[1];
-
-                        const int64_t  ne40 = src4->ne[0]; GGML_UNUSED(ne40);
-                        const int64_t  ne41 = src4->ne[1]; GGML_UNUSED(ne41);
-                        const int64_t  ne42 = src4->ne[2]; GGML_UNUSED(ne42);
-
-                        const uint64_t nb40 = src4->nb[0];
-                        const uint64_t nb41 = src4->nb[1];
-                        const uint64_t nb42 = src4->nb[2];
-
-                        const int64_t  ne50 = src5->ne[0]; GGML_UNUSED(ne50);
-                        const int64_t  ne51 = src5->ne[1]; GGML_UNUSED(ne51);
-                        const int64_t  ne52 = src5->ne[2]; GGML_UNUSED(ne52);
-
-                        const uint64_t nb50 = src5->nb[0];
-                        const uint64_t nb51 = src5->nb[1];
-                        const uint64_t nb52 = src5->nb[2];
-
-                        const int64_t d_state      = ne00;
-                        const int64_t d_inner      = ne01;
-                        const int64_t n_seq_tokens = ne11;
-                        const int64_t n_seqs       = ne02;
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
-                        [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
-                        [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
-                        [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:6];
-
-                        [encoder setBytes:&d_state      length:sizeof(d_state)      atIndex:7];
-                        [encoder setBytes:&d_inner      length:sizeof(d_inner)      atIndex:8];
-                        [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
-                        [encoder setBytes:&n_seqs       length:sizeof(n_seqs)       atIndex:10];
-
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                        [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
-                        [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
-                        [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
-                        [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
-                        [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
-                        [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
-                        [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
-                        [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
-                        [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
-                        [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
-                        [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_MUL_MAT:
-                    {
-                        GGML_ASSERT(ne00 == ne10);
-
-                        GGML_ASSERT(ne12 % ne02 == 0);
-                        GGML_ASSERT(ne13 % ne03 == 0);
-
-                        const uint r2 = ne12/ne02;
-                        const uint r3 = ne13/ne03;
-
-                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                        // to the matrix-vector kernel
-                        int ne11_mm_min = 1;
+                {
+                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_OP_SQR:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SQRT:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SIN:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_COS:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SUM_ROWS:
+            {
+                GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
+                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
+
+                int nth = 32; // SIMD width
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+                if (ne00%4 == 0) {
+                    while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
+                        nth *= 2;
+                    }
+                    if (use_f16) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
+                    }
+                } else {
+                    while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+                        nth *= 2;
+                    }
+                    if (use_f16) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline;
+                    }
+                }
+
+                float scale;
+                float max_bias;
+
+                memcpy(&scale,    ((const int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
+
+                const int64_t nrows_x = ggml_nrows(src0);
+                const int64_t nrows_y = src0->ne[1];
+
+                const uint32_t n_head      = nrows_x/nrows_y;
+                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                if (id_src1) {
+                    [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
+                }
+                [encoder setBuffer:id_dst      offset:offs_dst            atIndex:2];
+                [encoder setBytes:&ne00        length:sizeof(ne00)        atIndex:3];
+                [encoder setBytes:&ne01        length:sizeof(ne01)        atIndex:4];
+                [encoder setBytes:&ne02        length:sizeof(ne02)        atIndex:5];
+                [encoder setBytes:&scale       length:sizeof(scale)       atIndex:6];
+                [encoder setBytes:&max_bias    length:sizeof(max_bias)    atIndex:7];
+                [encoder setBytes:&m0          length:sizeof(m0)          atIndex:8];
+                [encoder setBytes:&m1          length:sizeof(m1)          atIndex:9];
+                [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_DIAG_MASK_INF:
+            {
+                const int n_past = ((const int32_t *)(dst->op_params))[0];
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (ne00%8 == 0) {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
+                } else {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+
+                if (ne00%8 == 0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+                else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                [encoder setBytes:&ne01    length:sizeof(ne01) atIndex:4];
+                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:5];
+                [encoder setBytes:&nb00    length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&ne10    length:sizeof(ne10) atIndex:9];
+                [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:10];
+                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
+                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
+                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                [encoder setBytes:&ne2     length:sizeof(ne2)  atIndex:15];
+                [encoder setBytes:&nb0     length:sizeof(nb0)  atIndex:16];
+                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:17];
+                [encoder setBytes:&nb2     length:sizeof(nb2)  atIndex:18];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                struct ggml_tensor * src3 = node->src[3];
+                struct ggml_tensor * src4 = node->src[4];
+                struct ggml_tensor * src5 = node->src[5];
+
+                GGML_ASSERT(src3);
+                GGML_ASSERT(src4);
+                GGML_ASSERT(src5);
+
+                size_t offs_src3 = 0;
+                size_t offs_src4 = 0;
+                size_t offs_src5 = 0;
+
+                id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
+                id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
+                id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
+
+                const int64_t  ne30 = src3->ne[0]; GGML_UNUSED(ne30);
+                const int64_t  ne31 = src3->ne[1]; GGML_UNUSED(ne31);
+
+                const uint64_t nb30 = src3->nb[0];
+                const uint64_t nb31 = src3->nb[1];
+
+                const int64_t  ne40 = src4->ne[0]; GGML_UNUSED(ne40);
+                const int64_t  ne41 = src4->ne[1]; GGML_UNUSED(ne41);
+                const int64_t  ne42 = src4->ne[2]; GGML_UNUSED(ne42);
+
+                const uint64_t nb40 = src4->nb[0];
+                const uint64_t nb41 = src4->nb[1];
+                const uint64_t nb42 = src4->nb[2];
+
+                const int64_t  ne50 = src5->ne[0]; GGML_UNUSED(ne50);
+                const int64_t  ne51 = src5->ne[1]; GGML_UNUSED(ne51);
+                const int64_t  ne52 = src5->ne[2]; GGML_UNUSED(ne52);
+
+                const uint64_t nb50 = src5->nb[0];
+                const uint64_t nb51 = src5->nb[1];
+                const uint64_t nb52 = src5->nb[2];
+
+                const int64_t d_state      = ne00;
+                const int64_t d_inner      = ne01;
+                const int64_t n_seq_tokens = ne11;
+                const int64_t n_seqs       = ne02;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
+                [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
+                [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:6];
+
+                [encoder setBytes:&d_state      length:sizeof(d_state)      atIndex:7];
+                [encoder setBytes:&d_inner      length:sizeof(d_inner)      atIndex:8];
+                [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
+                [encoder setBytes:&n_seqs       length:sizeof(n_seqs)       atIndex:10];
+
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
+                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
+                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
+                [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
+                [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
+                [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
+                [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
+                [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
+                [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
+                [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
+                [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                GGML_ASSERT(ne00 == ne10);
+
+                GGML_ASSERT(ne12 % ne02 == 0);
+                GGML_ASSERT(ne13 % ne03 == 0);
+
+                const uint r2 = ne12/ne02;
+                const uint r3 = ne13/ne03;
+
+                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                // to the matrix-vector kernel
+                int ne11_mm_min = 1;
 
 #if 0
-                        // the numbers below are measured on M2 Ultra for 7B and 13B models
-                        // these numbers do not translate to other devices or model sizes
-                        // TODO: need to find a better approach
+                // the numbers below are measured on M2 Ultra for 7B and 13B models
+                // these numbers do not translate to other devices or model sizes
+                // TODO: need to find a better approach
                         if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
                             switch (src0t) {
                                 case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
@@ -1763,11 +1745,11 @@ static enum ggml_status ggml_metal_graph_compute(
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                         if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                            !ggml_is_transposed(src0) &&
-                            !ggml_is_transposed(src1) &&
-                            src1t == GGML_TYPE_F32 &&
-                            ne00 % 32 == 0 && ne00 >= 64 &&
-                            (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                                !ggml_is_transposed(src0) &&
+                                !ggml_is_transposed(src1) &&
+                                src1t == GGML_TYPE_F32 &&
+                                ne00 % 32 == 0 && ne00 >= 64 &&
+                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
                             //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
                             // some Metal matrix data types require aligned pointers
@@ -2001,8 +1983,8 @@ static enum ggml_status ggml_metal_graph_compute(
                             [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
 
                             if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
-                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
-                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                                    src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                                    src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                             }
                             else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -2036,1041 +2018,1158 @@ static enum ggml_status ggml_metal_graph_compute(
                                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                             }
                         }
-                    } break;
-                case GGML_OP_MUL_MAT_ID:
-                    {
-                        const int n_as = src0->ne[2];
-
-                        // src2 = ids
-                        const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
-
-                        GGML_ASSERT(src2t == GGML_TYPE_I32);
-
-                        GGML_ASSERT(!ggml_is_transposed(src0));
-                        GGML_ASSERT(!ggml_is_transposed(src1));
-
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                        // to the matrix-vector kernel
-                        // ne20 = n_used_experts
-                        // ne21 = n_rows
-                        const int dst_rows = ne20*ne21;
-                        const int dst_rows_min = n_as;
-                        const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
-
-                        // max size of the rowids array in the kernel shared buffer
-                        GGML_ASSERT(dst_rows <= dst_rows_max);
-
-                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        // !!!
-                        // TODO: for now, always use mat-vec kernels until we figure out how to improve the
-                        //       indirect matrix multiplication
-                        // !!!
-                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                            ne00 % 32 == 0 && ne00 >= 64 &&
-                            dst_rows > dst_rows_min) {
-
-                            // some Metal matrix data types require aligned pointers
-                            // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
-                                case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8  == 0); break;
-                                default: break;
-                            }
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ABORT("MUL_MAT_ID not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                            [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
-                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne21    length:sizeof(ne21) atIndex:5];
-                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:7];
-                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:8];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:9];
-                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:10];
-                            [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:11];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:17];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:18];
-                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:19];
-
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        } else {
-                            int nth0 = 32;
-                            int nth1 = 1;
-                            int nrows = 1;
-                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            // use custom matrix x vector kernel
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q8_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q2_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q3_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_K:
-                                    {
-                                        nth0 = 4; //1;
-                                        nth1 = 8; //32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q6_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_M:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_NL:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
-                                    } break;
-                                default:
-                                    {
-                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                        GGML_ABORT("not implemented");
-                                    }
-                            };
-
-                            if (ggml_is_quantized(src0t)) {
-                                GGML_ASSERT(ne00 >= nth0*nth1);
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
-                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:20];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:21];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:22];
-
-                            const int64_t _ne1 = 1;
-                            const int tgz = dst_rows;
-
-                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
-                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
-                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
-                                const int mem_size = 32*sizeof(float);
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q3_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            } else {
-                                const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                        }
-                    } break;
-                case GGML_OP_GET_ROWS:
-                    {
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (src0->type) {
-                            case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
-                            case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
-                            case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
-                            case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
-                            case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
-                            case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
-                            case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
-                            case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
-                            case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
-                            case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
-                            case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
-                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
-                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
-                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
-                            case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
-                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
-                            case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S  ].pipeline; break;
-                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
-                            case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M  ].pipeline; break;
-                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
-                            case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
-                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                            default: GGML_ABORT("not implemented");
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
-                        [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
-                        [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
-                        [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
-                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
-                    } break;
-                case GGML_OP_RMS_NORM:
-                    {
-                        GGML_ASSERT(ne00 % 4 == 0);
-                        GGML_ASSERT(ggml_is_contiguous_1(src0));
-
-                        float eps;
-                        memcpy(&eps, dst->op_params, sizeof(float));
-
-                        int nth = 32; // SIMD width
-
-                        while (nth < ne00/4 && nth < 1024) {
-                            nth *= 2;
-                        }
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        const int64_t nrows = ggml_nrows(src0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_GROUP_NORM:
-                    {
-                        GGML_ASSERT(ne00 % 4 == 0);
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        float eps;
-                        memcpy(&eps, dst->op_params + 1, sizeof(float));
-
-                        const int32_t n_groups = ((int32_t *) dst->op_params)[0];
-
-                        int nth = 32; // SIMD width
-
-                        //while (nth < ne00/4 && nth < 1024) {
-                        //    nth *= 2;
-                        //}
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
-                        [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
-                        [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
-                        [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_NORM:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous_1(src0));
-
-                        float eps;
-                        memcpy(&eps, dst->op_params, sizeof(float));
-
-                        const int nth = MIN(256, ne00);
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
-
-                        const int64_t nrows = ggml_nrows(src0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ROPE:
-                    {
-                        GGML_ASSERT(ne10 == ne02);
-
-                        const int nth = MIN(1024, ne00);
-
-                        const int n_past     = ((int32_t *) dst->op_params)[0];
-                        const int n_dims     = ((int32_t *) dst->op_params)[1];
-                        const int mode       = ((int32_t *) dst->op_params)[2];
-                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
-                        const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
-                        float freq_base;
-                        float freq_scale;
-                        float ext_factor;
-                        float attn_factor;
-                        float beta_fast;
-                        float beta_slow;
-
-                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-                        const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (!is_neox) {
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
-                                default: GGML_ABORT("fatal error");
-                            };
-                        } else {
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-                                default: GGML_ABORT("fatal error");
-                            };
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                        if (id_src2 != nil) {
-                            [encoder setBuffer:id_src2 offset:offs_src2        atIndex:2];
-                        } else {
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:2];
-                        }
-                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:3];
-                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:5];
-                        [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:6];
-                        [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:7];
-                        [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:8];
-                        [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:10];
-                        [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:11];
-                        [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:12];
-                        [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:13];
-                        [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:14];
-                        [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:15];
-                        [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:16];
-                        [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:17];
-                        [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:18];
-                        [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:19];
-                        [encoder setBytes:&n_past      length:sizeof(     int) atIndex:20];
-                        [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:21];
-                        [encoder setBytes:&n_ctx_orig  length:sizeof(     int) atIndex:22];
-                        [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                        [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                        [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                        [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                        [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                        [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_IM2COL:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F16);
-                        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-                        GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-                        const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-                        const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-                        const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-                        const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-                        const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-                        const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-
-                        const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-                        const int32_t N  = src1->ne[is_2D ? 3 : 2];
-                        const int32_t IC = src1->ne[is_2D ? 2 : 1];
-                        const int32_t IH = is_2D ? src1->ne[1] : 1;
-                        const int32_t IW =         src1->ne[0];
-
-                        const int32_t KH = is_2D ? src0->ne[1] : 1;
-                        const int32_t KW =         src0->ne[0];
-
-                        const int32_t OH = is_2D ? dst->ne[2] : 1;
-                        const int32_t OW =         dst->ne[1];
-
-                        const int32_t CHW = IC * KH * KW;
-
-                        const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
-                        const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (dst->type) {
-                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
-                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                            default: GGML_ABORT("fatal error");
-                        };
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                        [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                        [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                        [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                        [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                        [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                        [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                        [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                        [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                        [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                        [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
-                    } break;
-                case GGML_OP_UPSCALE:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        const float sf0 = (float)ne0/src0->ne[0];
-                        const float sf1 = (float)ne1/src0->ne[1];
-                        const float sf2 = (float)ne2/src0->ne[2];
-                        const float sf3 = (float)ne3/src0->ne[3];
-
-                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                        [encoder setBytes:&sf0  length:sizeof(sf0)  atIndex:18];
-                        [encoder setBytes:&sf1  length:sizeof(sf1)  atIndex:19];
-                        [encoder setBytes:&sf2  length:sizeof(sf2)  atIndex:20];
-                        [encoder setBytes:&sf3  length:sizeof(sf3)  atIndex:21];
-
-                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_PAD:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-
-                        const int nth = MIN(1024, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ARANGE:
-                    {
-                        GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-                        float start;
-                        float step;
-
-                        memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
-                        memcpy(&step,  ((int32_t *) dst->op_params) + 2, sizeof(float));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
-                        [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
-                        [encoder setBytes:&start length:sizeof(start) atIndex:2];
-                        [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
-
-                        const int nth = MIN(1024, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_TIMESTEP_EMBEDDING:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        const int dim        = dst->op_params[0];
-                        const int max_period = dst->op_params[1];
-
-                        const int half = dim / 2;
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&nb1   length:sizeof(nb1) atIndex:2];
-                        [encoder setBytes:&dim   length:sizeof(dim) atIndex:3];
-                        [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
-
-                        const int nth = MIN(1024, half);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ARGSORT:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-                        GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-                        const int nrows = ggml_nrows(src0);
-
-                        enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-                        // bitonic sort requires the number of elements to be power of 2
-                        int64_t ne00_padded = 1;
-                        while (ne00_padded < ne00) {
-                            ne00_padded *= 2;
-                        }
-
-                        // Metal kernels require the buffer size to be multiple of 16 bytes
-                        // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
-                        const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (order) {
-                            case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                            case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                            default: GGML_ABORT("fatal error");
-                        };
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
-                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
-                    } break;
-                case GGML_OP_LEAKY_RELU:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        float slope;
-                        memcpy(&slope, dst->op_params, sizeof(float));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
-                        [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_FLASH_ATTN_EXT:
-                    {
-                        GGML_ASSERT(ne00 % 4  == 0);
-                        GGML_ASSERT(ne11 % 32 == 0);
-
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        GGML_ASSERT(ggml_are_same_shape (src1, src2));
-
-                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];
-
-                        size_t offs_src3 = 0;
-
-                        id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
-
-                        GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
-                        GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
-                                "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
-
-                        const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
-                      //const int64_t  ne31 = src3 ? src3->ne[1] : 0;
-                        const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
-                        const int64_t  ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
-
-                        const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
-                        const uint64_t nb31 = src3 ? src3->nb[1] : 0;
-                        const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
-                        const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
-
-                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
-
-                        float scale;
-                        float max_bias;
-                        float logit_softcap;
-                        memcpy(&scale,         ((int32_t *) dst->op_params) + 0, sizeof(scale));
-                        memcpy(&max_bias,      ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
-                        memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
-
-                        if (logit_softcap != 0.0f) {
-                            scale /= logit_softcap;
-                        }
-
-                        const uint32_t n_head      = src0->ne[2];
-                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
-
-                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        bool use_vec_kernel = false;
-
-                        if (ne01 >= 4 || (ne00%128 != 0)) {
-                            switch (ne00) {
-                                case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
-                                case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
-                                case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
-                                case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
-                                case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
-                              //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
-                                default:
-                                          {
-                                              GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                                              GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-                                              GGML_ABORT("add template specialization for this size");
-                                          }
-                            }
-                        } else {
-                            use_vec_kernel = true;
-
-                            switch (ne00) {
-                                case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
-                              //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
-                                default:
-                                          {
-                                              GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                                              GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-                                              GGML_ABORT("add template specialization for this size");
-                                          }
-                            }
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0           atIndex:0];
-                        [encoder setBuffer:id_src1     offset:offs_src1           atIndex:1];
-                        [encoder setBuffer:id_src2     offset:offs_src2           atIndex:2];
-                        if (id_src3) {
-                            [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
-                        } else {
-                            [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
-                        }
-                        [encoder setBuffer:id_dst        offset:offs_dst              atIndex:4];
-                        [encoder setBytes:&ne01          length:sizeof( int64_t)      atIndex:5];
-                        [encoder setBytes:&ne02          length:sizeof( int64_t)      atIndex:6];
-                        [encoder setBytes:&ne03          length:sizeof( int64_t)      atIndex:7];
-                        [encoder setBytes:&nb01          length:sizeof(uint64_t)      atIndex:8];
-                        [encoder setBytes:&nb02          length:sizeof(uint64_t)      atIndex:9];
-                        [encoder setBytes:&nb03          length:sizeof(uint64_t)      atIndex:10];
-                        [encoder setBytes:&ne11          length:sizeof( int64_t)      atIndex:11];
-                        [encoder setBytes:&ne12          length:sizeof( int64_t)      atIndex:12];
-                        [encoder setBytes:&ne13          length:sizeof( int64_t)      atIndex:13];
-                        [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
-                        [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
-                        [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
-                        [encoder setBytes:&nb21          length:sizeof(uint64_t)      atIndex:17];
-                        [encoder setBytes:&nb22          length:sizeof(uint64_t)      atIndex:18];
-                        [encoder setBytes:&nb23          length:sizeof(uint64_t)      atIndex:19];
-                        [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:20];
-                        [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:21];
-                        [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:22];
-                        [encoder setBytes:&scale         length:sizeof(   float)      atIndex:23];
-                        [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:24];
-                        [encoder setBytes:&m0            length:sizeof(m0)            atIndex:25];
-                        [encoder setBytes:&m1            length:sizeof(m1)            atIndex:26];
-                        [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:27];
-                        [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
-
-                        if (!use_vec_kernel) {
-                            // half8x8 kernel
-                            const int64_t nqptg = 8;  // queries per threadgroup    !! sync with kernel template arguments !!
-                            const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
-
-                            GGML_ASSERT(nqptg <= 32);
-                            GGML_ASSERT(nqptg  % 8  == 0);
-                            GGML_ASSERT(ncpsg  % 32 == 0);
-
-                            int64_t nsgmax = 2;
-
-                            while (true) {
-                                const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
-                                if (smem > ctx->device.maxThreadgroupMemoryLength) {
-                                    break;
-                                }
-                                nsgmax *= 2;
-                            }
-                            nsgmax /= 2;
-
-                            // simdgroups per threadgroup (a.k.a. warps)
-                            const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
-
-                            const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
-
-                            //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
-                            GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
-
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
-                        } else {
-                            // half1x4 kernel
-                            const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
-                            const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
-
-                            GGML_ASSERT(nqptg <= 32);
-                            GGML_ASSERT(nqptg  % 1  == 0);
-                            GGML_ASSERT(ncpsg  % 32 == 0);
-
-                            // simdgroups per threadgroup (a.k.a. warps)
-                            const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32));
-
-                            int64_t nsg = 1;
-                            while (nsg <= nsgt) {
-                                nsg *= 2;
-                            }
-                            nsg /= 2;
-
-                            const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2);
-
-                            //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
-                            GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
-                        }
-                    } break;
-                case GGML_OP_DUP:
-                case GGML_OP_CPY:
-                case GGML_OP_CONT:
-                    {
-                        GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                        int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (src0t) {
-                            case GGML_TYPE_F32:
-                                {
-                                    GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
-
-                                    switch (dstt) {
-                                        case GGML_TYPE_F32:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
-                                        case GGML_TYPE_F16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
-                                        case GGML_TYPE_Q8_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
-                                        case GGML_TYPE_Q4_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
-                                        case GGML_TYPE_Q4_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
-                                        case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
-                                        case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
-                                        case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
-                                        default: GGML_ABORT("not implemented");
-                                    };
-                                } break;
-                            case GGML_TYPE_F16:
-                                {
-                                    switch (dstt) {
-                                        case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
-                                        case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                        default: GGML_ABORT("not implemented");
-                                    };
-                                } break;
-                            default: GGML_ABORT("not implemented");
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                        [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                        [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                        [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                        [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                        [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                        [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                        [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                        [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                        [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                default:
-                    {
-                        GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                        GGML_ABORT("fatal error");
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                const int n_as = src0->ne[2];
+
+                // src2 = ids
+                const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
+
+                GGML_ASSERT(src2t == GGML_TYPE_I32);
+
+                GGML_ASSERT(!ggml_is_transposed(src0));
+                GGML_ASSERT(!ggml_is_transposed(src1));
+
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                // to the matrix-vector kernel
+                // ne20 = n_used_experts
+                // ne21 = n_rows
+                const int dst_rows = ne20*ne21;
+                const int dst_rows_min = n_as;
+                const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
+
+                // max size of the rowids array in the kernel shared buffer
+                GGML_ASSERT(dst_rows <= dst_rows_max);
+
+                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                // !!!
+                // TODO: for now, always use mat-vec kernels until we figure out how to improve the
+                //       indirect matrix multiplication
+                // !!!
+                if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                        ne00 % 32 == 0 && ne00 >= 64 &&
+                        dst_rows > dst_rows_min) {
+
+                    // some Metal matrix data types require aligned pointers
+                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        default: break;
                     }
-            }
 
-            if (should_capture) {
-                [encoder popDebugGroup];
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
+                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
+                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
+                        default: GGML_ABORT("MUL_MAT_ID not implemented");
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
+                    [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                    [encoder setBytes:&ne21    length:sizeof(ne21) atIndex:5];
+                    [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
+                    [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:7];
+                    [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:8];
+                    [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:9];
+                    [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:10];
+                    [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:11];
+                    [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:12];
+                    [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:13];
+                    [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:14];
+                    [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:15];
+                    [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:16];
+                    [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:17];
+                    [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:18];
+                    [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:19];
+
+                    [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                } else {
+                    int nth0 = 32;
+                    int nth1 = 1;
+                    int nrows = 1;
+                    //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    // use custom matrix x vector kernel
+                    switch (src0t) {
+                        case GGML_TYPE_F32:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_F16:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                nth0 = 32;
+                                nth1 = 1;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q2_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            {
+                                nth0 = 4; //1;
+                                nth1 = 8; //32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_M:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
+                            } break;
+                        default:
+                            {
+                                GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                GGML_ABORT("not implemented");
+                            }
+                    };
+
+                    if (ggml_is_quantized(src0t)) {
+                        GGML_ASSERT(ne00 >= nth0*nth1);
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
+                    [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                    [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
+                    [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
+                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
+                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
+                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
+                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
+                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
+                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
+                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
+                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
+                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
+                    [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
+                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
+                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
+                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
+                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:20];
+                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:21];
+                    [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:22];
+
+                    const int64_t _ne1 = 1;
+                    const int tgz = dst_rows;
+
+                    if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                            src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                            src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                        const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                        const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
+                        const int mem_size = 32*sizeof(float);
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q3_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q5_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q6_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    } else {
+                        const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                }
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (src0->type) {
+                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
+                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
+                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
+                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
+                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
+                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
+                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
+                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
+                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
+                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
+                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
+                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                    case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
+                    case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
+                    case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M  ].pipeline; break;
+                    case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
+                    case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
+                    case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
+                    default: GGML_ABORT("not implemented");
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
+                [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
+                [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
+                [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
+                [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
+                [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
+                [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+            } break;
+        case GGML_OP_RMS_NORM:
+            {
+                GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params, sizeof(float));
+
+                int nth = 32; // SIMD width
+
+                while (nth < ne00/4 && nth < 1024) {
+                    nth *= 2;
+                }
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+                const int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+
+                int nth = 32; // SIMD width
+
+                //while (nth < ne00/4 && nth < 1024) {
+                //    nth *= 2;
+                //}
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
+                [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
+                [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
+                [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
+                [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
+                [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_NORM:
+            {
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params, sizeof(float));
+
+                const int nth = MIN(256, ne00);
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ROPE:
+            {
+                GGML_ASSERT(ne10 == ne02);
+
+                const int nth = MIN(1024, ne00);
+
+                const int n_past     = ((const int32_t *) dst->op_params)[0];
+                const int n_dims     = ((const int32_t *) dst->op_params)[1];
+                const int mode       = ((const int32_t *) dst->op_params)[2];
+                // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                const int n_ctx_orig = ((const int32_t *) dst->op_params)[4];
+
+                float freq_base;
+                float freq_scale;
+                float ext_factor;
+                float attn_factor;
+                float beta_fast;
+                float beta_slow;
+
+                memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (const int32_t *) dst->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
+
+                const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (!is_neox) {
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else {
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                if (id_src2 != nil) {
+                    [encoder setBuffer:id_src2 offset:offs_src2        atIndex:2];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:2];
+                }
+                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:3];
+                [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:4];
+                [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:5];
+                [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:6];
+                [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:7];
+                [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:8];
+                [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:9];
+                [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:10];
+                [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:11];
+                [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:12];
+                [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:13];
+                [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:14];
+                [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:15];
+                [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:16];
+                [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:17];
+                [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:18];
+                [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:19];
+                [encoder setBytes:&n_past      length:sizeof(     int) atIndex:20];
+                [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:21];
+                [encoder setBytes:&n_ctx_orig  length:sizeof(     int) atIndex:22];
+                [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
+                [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
+                [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
+                [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
+                [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
+                [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F16);
+                GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+                const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+                const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+                const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+                const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+                const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+                const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+
+                const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+                const int32_t N  = src1->ne[is_2D ? 3 : 2];
+                const int32_t IC = src1->ne[is_2D ? 2 : 1];
+                const int32_t IH = is_2D ? src1->ne[1] : 1;
+                const int32_t IW =         src1->ne[0];
+
+                const int32_t KH = is_2D ? src0->ne[1] : 1;
+                const int32_t KW =         src0->ne[0];
+
+                const int32_t OH = is_2D ? dst->ne[2] : 1;
+                const int32_t OW =         dst->ne[1];
+
+                const int32_t CHW = IC * KH * KW;
+
+                const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
+                const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (dst->type) {
+                    case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
+                    case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
+                [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
+                [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
+                [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
+                [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
+                [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
+                [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
+                [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
+                [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
+                [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
+                [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+            } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const float sf0 = (float)ne0/src0->ne[0];
+                const float sf1 = (float)ne1/src0->ne[1];
+                const float sf2 = (float)ne2/src0->ne[2];
+                const float sf3 = (float)ne3/src0->ne[3];
+
+                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                [encoder setBytes:&sf0  length:sizeof(sf0)  atIndex:18];
+                [encoder setBytes:&sf1  length:sizeof(sf1)  atIndex:19];
+                [encoder setBytes:&sf2  length:sizeof(sf2)  atIndex:20];
+                [encoder setBytes:&sf3  length:sizeof(sf3)  atIndex:21];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_PAD:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+                float start;
+                float step;
+
+                memcpy(&start, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&step,  ((const int32_t *) dst->op_params) + 2, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
+                [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
+                [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int dim        = dst->op_params[0];
+                const int max_period = dst->op_params[1];
+
+                const int half = dim / 2;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&nb1   length:sizeof(nb1) atIndex:2];
+                [encoder setBytes:&dim   length:sizeof(dim) atIndex:3];
+                [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+
+                const int nth = MIN(1024, half);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+                const int nrows = ggml_nrows(src0);
+
+                enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+                // bitonic sort requires the number of elements to be power of 2
+                int64_t ne00_padded = 1;
+                while (ne00_padded < ne00) {
+                    ne00_padded *= 2;
+                }
+
+                // Metal kernels require the buffer size to be multiple of 16 bytes
+                // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+                const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (order) {
+                    case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                    case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
+                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
+            } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                float slope;
+                memcpy(&slope, dst->op_params, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
+                [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                GGML_ASSERT(ne00 % 4  == 0);
+                GGML_ASSERT(ne11 % 32 == 0);
+
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                GGML_ASSERT(ggml_are_same_shape (src1, src2));
+
+                struct ggml_tensor * src3 = node->src[3];
+
+                size_t offs_src3 = 0;
+
+                id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
+
+                GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
+                GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
+                        "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
+
+                const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
+                //const int64_t  ne31 = src3 ? src3->ne[1] : 0;
+                const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
+                const int64_t  ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
+
+                const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
+                const uint64_t nb31 = src3 ? src3->nb[1] : 0;
+                const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
+                const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
+
+                const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
+
+                float scale;
+                float max_bias;
+                float logit_softcap;
+                memcpy(&scale,         ((const int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias,      ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
+                memcpy(&logit_softcap, ((const int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
+
+                if (logit_softcap != 0.0f) {
+                    scale /= logit_softcap;
+                }
+
+                const uint32_t n_head      = src0->ne[2];
+                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                bool use_vec_kernel = false;
+
+                if (ne01 >= 4 || (ne00%128 != 0)) {
+                    switch (ne00) {
+                        case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
+                        case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
+                        case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
+                        case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
+                        case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
+                                  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
+                        default:
+                                  {
+                                      GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                      GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                                      GGML_ABORT("add template specialization for this size");
+                                  }
+                    }
+                } else {
+                    use_vec_kernel = true;
+
+                    switch (ne00) {
+                        case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
+                                  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
+                        default:
+                                  {
+                                      GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                      GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                                      GGML_ABORT("add template specialization for this size");
+                                  }
+                    }
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0           atIndex:0];
+                [encoder setBuffer:id_src1     offset:offs_src1           atIndex:1];
+                [encoder setBuffer:id_src2     offset:offs_src2           atIndex:2];
+                if (id_src3) {
+                    [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
+                } else {
+                    [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
+                }
+                [encoder setBuffer:id_dst        offset:offs_dst              atIndex:4];
+                [encoder setBytes:&ne01          length:sizeof( int64_t)      atIndex:5];
+                [encoder setBytes:&ne02          length:sizeof( int64_t)      atIndex:6];
+                [encoder setBytes:&ne03          length:sizeof( int64_t)      atIndex:7];
+                [encoder setBytes:&nb01          length:sizeof(uint64_t)      atIndex:8];
+                [encoder setBytes:&nb02          length:sizeof(uint64_t)      atIndex:9];
+                [encoder setBytes:&nb03          length:sizeof(uint64_t)      atIndex:10];
+                [encoder setBytes:&ne11          length:sizeof( int64_t)      atIndex:11];
+                [encoder setBytes:&ne12          length:sizeof( int64_t)      atIndex:12];
+                [encoder setBytes:&ne13          length:sizeof( int64_t)      atIndex:13];
+                [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
+                [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
+                [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
+                [encoder setBytes:&nb21          length:sizeof(uint64_t)      atIndex:17];
+                [encoder setBytes:&nb22          length:sizeof(uint64_t)      atIndex:18];
+                [encoder setBytes:&nb23          length:sizeof(uint64_t)      atIndex:19];
+                [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:20];
+                [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:21];
+                [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:22];
+                [encoder setBytes:&scale         length:sizeof(   float)      atIndex:23];
+                [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:24];
+                [encoder setBytes:&m0            length:sizeof(m0)            atIndex:25];
+                [encoder setBytes:&m1            length:sizeof(m1)            atIndex:26];
+                [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:27];
+                [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
+
+                if (!use_vec_kernel) {
+                    // half8x8 kernel
+                    const int64_t nqptg = 8;  // queries per threadgroup    !! sync with kernel template arguments !!
+                    const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
+
+                    GGML_ASSERT(nqptg <= 32);
+                    GGML_ASSERT(nqptg  % 8  == 0);
+                    GGML_ASSERT(ncpsg  % 32 == 0);
+
+                    int64_t nsgmax = 2;
+
+                    while (true) {
+                        const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
+                        if (smem > ctx->device.maxThreadgroupMemoryLength) {
+                            break;
+                        }
+                        nsgmax *= 2;
+                    }
+                    nsgmax /= 2;
+
+                    // simdgroups per threadgroup (a.k.a. warps)
+                    const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
+
+                    const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
+
+                    //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
+                    GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
+
+                    [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                } else {
+                    // half1x4 kernel
+                    const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
+                    const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
+
+                    GGML_ASSERT(nqptg <= 32);
+                    GGML_ASSERT(nqptg  % 1  == 0);
+                    GGML_ASSERT(ncpsg  % 32 == 0);
+
+                    // simdgroups per threadgroup (a.k.a. warps)
+                    const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32));
+
+                    int64_t nsg = 1;
+                    while (nsg <= nsgt) {
+                        nsg *= 2;
+                    }
+                    nsg /= 2;
+
+                    const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2);
+
+                    //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
+                    GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
+                    [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                }
+            } break;
+        case GGML_OP_DUP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+            {
+                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (src0t) {
+                    case GGML_TYPE_F32:
+                        {
+                            GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
+
+                            switch (dstt) {
+                                case GGML_TYPE_F32:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
+                                case GGML_TYPE_F16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
+                                case GGML_TYPE_Q8_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
+                                case GGML_TYPE_Q4_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
+                                case GGML_TYPE_Q4_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
+                                case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
+                                case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
+                                case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_F16:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
+                                case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    default: GGML_ABORT("not implemented");
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+       default:
+            {
+                GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static enum ggml_status ggml_metal_graph_compute(
+        struct ggml_backend_metal_context * ctx,
+                       struct ggml_cgraph * gf) {
+    // number of nodes encoded by the main thread (empirically determined)
+    const int n_main = 128;
+
+    // number of threads in addition to the main thread
+    const int n_cb = ctx->n_cb;
+
+    // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
+    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
+    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
+    // each thread creates it's own command buffer and enqueues the ops in parallel
+    //
+    // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
+
+    @autoreleasepool {
+        ctx->gf = gf;
+
+        ctx->n_nodes_0 = MIN(n_main, gf->n_nodes);
+        ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;
+
+        ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
+
+        const bool should_capture = ctx->capture_next_compute;
+        if (should_capture) {
+            ctx->capture_next_compute = false;
+
+            if (!ctx->capture_started) {
+                // create capture scope
+                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+
+                MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
+                descriptor.captureObject = ctx->capture_scope;
+                descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+                descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
+
+                NSError * error = nil;
+                if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
+                    GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
+                    GGML_ABORT("capture failed");
+                } else {
+                    [ctx->capture_scope beginScope];
+                    ctx->capture_started = true;
+                }
             }
         }
 
-        [encoder endEncoding];
+        // TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes.
+        ctx->encode_async = ^(size_t iter) {
+            const int cb_idx = iter;
+            const int n_cb_l = ctx->n_cb;
 
-        if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer commit];
-        }
-    });
+            const int n_nodes_0 = ctx->n_nodes_0;
+            const int n_nodes_1 = ctx->n_nodes_1;
 
-    // Wait for completion and check status of each command buffer
-    // needed to detect if the device ran out-of-memory for example (#1881)
+            const int n_nodes_per_cb = ctx->n_nodes_per_cb;
 
-    for (int i = 0; i < n_cb; ++i) {
-        id<MTLCommandBuffer> command_buffer = command_buffers[i];
-        [command_buffer waitUntilCompleted];
+            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc];
 
-        MTLCommandBufferStatus status = [command_buffer status];
-        if (status != MTLCommandBufferStatusCompleted) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            if (status == MTLCommandBufferStatusError) {
-                GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+            int node_start = 0;
+            int node_end   = n_nodes_0;
+
+            if (cb_idx < n_cb_l) {
+                node_start = n_nodes_0 + (                                         (cb_idx + 0) * n_nodes_per_cb);
+                node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
             }
 
-            return GGML_STATUS_FAILED;
+            for (int idx = node_start; idx < node_end; ++idx) {
+                if (should_capture) {
+                    [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]];
+                }
+
+                ggml_metal_encode_node(ctx, idx, encoder);
+
+                if (should_capture) {
+                    [encoder popDebugGroup];
+                }
+            }
+
+            [encoder endEncoding];
+
+            if (cb_idx < 2 || ctx->abort_callback == NULL) {
+                [command_buffer commit];
+            }
+        };
+
+        // the main thread commits the first few commands immediately
+        // command_buffer[n_cb]
+        {
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            ctx->command_buffers[n_cb] = command_buffer;
+
+            [command_buffer enqueue];
+            ctx->encode_async(n_cb);
         }
 
-        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
-        if (!next_buffer) {
-            continue;
+        // prepare the rest of the command buffers asynchronously
+        // command_buffer[0.. n_cb)
+        for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            ctx->command_buffers[cb_idx] = command_buffer;
+
+            // always enqueue the first two command buffers
+            // enqueue all of the command buffers if we don't need to abort
+            if (cb_idx < 2 || ctx->abort_callback == NULL) {
+                [command_buffer enqueue];
+            }
         }
 
-        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
-        if (next_queued) {
-            continue;
+        dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
+
+        // wait for completion and check status of each command buffer
+        // needed to detect if the device ran out-of-memory for example (#1881)
+        {
+            id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
+            [command_buffer waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [command_buffer status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
         }
 
-        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
-            return GGML_STATUS_ABORTED;
+        for (int i = 0; i < n_cb; ++i) {
+            id<MTLCommandBuffer> command_buffer = ctx->command_buffers[i];
+            [command_buffer waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [command_buffer status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
+
+            id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil);
+            if (!next_buffer) {
+                continue;
+            }
+
+            const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+            if (next_queued) {
+                continue;
+            }
+
+            if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+                GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+                return GGML_STATUS_ABORTED;
+            }
+
+            [next_buffer commit];
         }
 
-        [next_buffer commit];
+        if (!should_capture && ctx->capture_started) {
+            [ctx->capture_scope endScope];
+            [[MTLCaptureManager sharedCaptureManager] stopCapture];
+        }
     }
 
-    if (should_capture) {
-        [[MTLCaptureManager sharedCaptureManager] stopCapture];
-    }
-
-    }
     return GGML_STATUS_SUCCESS;
 }
 
@@ -3405,6 +3504,25 @@ GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, g
     UNUSED(backend);
 }
 
+static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    if (ctx->n_cb != n_cb) {
+        ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
+
+        if (ctx->n_cb > 2) {
+            GGML_METAL_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
+        }
+    }
+
+    // TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why?
+    //ctx->encode_async = ^(size_t iter) {
+    //    ...
+    //};
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name                = */ ggml_backend_metal_name,
     /* .free                    = */ ggml_backend_metal_free,
@@ -3439,35 +3557,29 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
 }
 
 ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+    struct ggml_backend_metal_context * ctx = ggml_metal_init();
     if (ctx == NULL) {
         GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
-    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+    ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
 
-    *metal_backend = (struct ggml_backend) {
+    *backend = (struct ggml_backend) {
         /* .guid      = */ ggml_backend_metal_guid(),
         /* .interface = */ ggml_backend_metal_i,
         /* .context   = */ ctx,
     };
 
-    return metal_backend;
+    ggml_backend_metal_set_n_cb(backend, 1);
+
+    return backend;
 }
 
 bool ggml_backend_is_metal(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid());
 }
 
-void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
-
-    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-}
-
 void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
     GGML_ASSERT(ggml_backend_is_metal(backend));
 
@@ -3489,7 +3601,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
     GGML_ASSERT(ggml_backend_is_metal(backend));
 
     struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
-    ctx->should_capture_next_compute = true;
+    ctx->capture_next_compute = true;
 }
 
 GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
diff --git a/src/llama.cpp b/src/llama.cpp
index d1d27d21e..4c0a1bb61 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17025,12 +17025,6 @@ static void llama_graph_compute(
             ggml_cgraph * gf,
                     int   n_threads,
         ggml_threadpool * threadpool) {
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);

From 7254cdf7e8d6312840509ca8d766358c687c6266 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 29 Sep 2024 23:18:02 +0200
Subject: [PATCH 74/78] ggml: fix gradient allocation logic (ggml/966)

* ggml: fix gradient allocation logic

* gradient allocation in ggml_build_backward_expand

* fixup

* fix test-backend-ops grad

* suggestions by slaren

* fix test1.c

* fix legacy opt API

* fix test-grad0

* remove keep arg
---
 ggml/include/ggml.h        |   40 +-
 ggml/src/ggml.c            | 1464 +++++++++++-------------------------
 tests/test-backend-ops.cpp |   37 +-
 tests/test-grad0.cpp       |   14 +-
 4 files changed, 490 insertions(+), 1065 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f46d4a8a6..a8a74bee1 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -577,10 +577,10 @@ extern "C" {
 
     // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT    = 1, // ...is an input for the GGML compute graph
-        GGML_TENSOR_FLAG_OUTPUT   = 2, // ...is an output for the GGML compute graph
-        GGML_TENSOR_FLAG_PARAM    = 4, // ...contains trainable parameters
-        GGML_TENSOR_FLAG_LOSS     = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
+        GGML_TENSOR_FLAG_INPUT  =  1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT =  2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM  =  4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
     // n-dimensional tensor
@@ -1410,14 +1410,14 @@ extern "C" {
     // supports 3D: a->ne[2] == b->ne[1]
     GGML_API struct ggml_tensor * ggml_get_rows(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * a,  // data
+            struct ggml_tensor  * b); // row indices
 
     GGML_API struct ggml_tensor * ggml_get_rows_back(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c);
+            struct ggml_tensor  * a,  // gradients of ggml_get_rows result
+            struct ggml_tensor  * b,  // row indices
+            struct ggml_tensor  * c); // data for ggml_get_rows, only used for its shape
 
     GGML_API struct ggml_tensor * ggml_diag(
         struct ggml_context     * ctx,
@@ -1568,9 +1568,9 @@ extern "C" {
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c,
+            struct ggml_tensor  * a, // gradients of ggml_rope result
+            struct ggml_tensor  * b, // positions
+            struct ggml_tensor  * c, // freq factors
             int                   n_dims,
             int                   mode,
             int                   n_ctx_orig,
@@ -2036,15 +2036,15 @@ extern "C" {
     // loss function
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b);
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor  * b); // labels
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-            struct ggml_tensor          * c);
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor  * b,  // labels
+            struct ggml_tensor  * c); // gradients of cross_entropy_loss result
 
     // AdamW optimizer step
     // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
@@ -2066,7 +2066,7 @@ extern "C" {
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
 
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
+    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
 
     GGML_API void ggml_build_opt_adamw(
             struct ggml_context * ctx,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 81b651c6a..fa8d6c25a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4725,18 +4725,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
 
 static struct ggml_tensor * ggml_dup_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_DUP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_DUP;
     result->src[0] = a;
 
     return result;
@@ -4744,13 +4737,13 @@ static struct ggml_tensor * ggml_dup_impl(
 
 struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
+        struct ggml_tensor  * a) {
     return ggml_dup_impl(ctx, a, false);
 }
 
 struct ggml_tensor * ggml_dup_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
+        struct ggml_tensor  * a) {
     return ggml_dup_impl(ctx, a, true);
 }
 
@@ -4758,21 +4751,14 @@ struct ggml_tensor * ggml_dup_inplace(
 
 static struct ggml_tensor * ggml_add_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ADD;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -4781,15 +4767,15 @@ static struct ggml_tensor * ggml_add_impl(
 
 struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_add_impl(ctx, a, b, false);
 }
 
 struct ggml_tensor * ggml_add_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_add_impl(ctx, a, b, true);
 }
 
@@ -4797,9 +4783,9 @@ struct ggml_tensor * ggml_add_inplace(
 
 static struct ggml_tensor * ggml_add_cast_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        enum   ggml_type      type) {
     // TODO: support less-strict constraint
     //       GGML_ASSERT(ggml_can_repeat(b, a));
     GGML_ASSERT(ggml_can_repeat_rows(b, a));
@@ -4809,18 +4795,9 @@ static struct ggml_tensor * ggml_add_cast_impl(
                 a->type == GGML_TYPE_F16 ||
                 a->type == GGML_TYPE_BF16);
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
 
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
+    result->op     = GGML_OP_ADD;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -4829,9 +4806,9 @@ static struct ggml_tensor * ggml_add_cast_impl(
 
 struct ggml_tensor * ggml_add_cast(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        enum   ggml_type      type) {
     return ggml_add_cast_impl(ctx, a, b, type);
 }
 
@@ -4839,22 +4816,15 @@ struct ggml_tensor * ggml_add_cast(
 
 static struct ggml_tensor * ggml_add1_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
     GGML_ASSERT(ggml_is_scalar(b));
     GGML_ASSERT(ggml_is_padded_1d(a));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_ADD1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ADD1;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -4863,15 +4833,15 @@ static struct ggml_tensor * ggml_add1_impl(
 
 struct ggml_tensor * ggml_add1(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_add1_impl(ctx, a, b, false);
 }
 
 struct ggml_tensor * ggml_add1_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_add1_impl(ctx, a, b, true);
 }
 
@@ -4879,31 +4849,24 @@ struct ggml_tensor * ggml_add1_inplace(
 
 static struct ggml_tensor * ggml_acc_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset,
+        bool                  inplace) {
     GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(a->type == GGML_TYPE_F32);
     GGML_ASSERT(b->type == GGML_TYPE_F32);
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_ACC;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ACC;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -4912,23 +4875,23 @@ static struct ggml_tensor * ggml_acc_impl(
 
 struct ggml_tensor * ggml_acc(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
     return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
 }
 
 struct ggml_tensor * ggml_acc_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
     return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
 }
 
@@ -4936,23 +4899,14 @@ struct ggml_tensor * ggml_acc_inplace(
 
 static struct ggml_tensor * ggml_sub_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SUB;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SUB;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -4961,15 +4915,15 @@ static struct ggml_tensor * ggml_sub_impl(
 
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_sub_impl(ctx, a, b, false);
 }
 
 struct ggml_tensor * ggml_sub_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     return ggml_sub_impl(ctx, a, b, true);
 }
 
@@ -4977,27 +4931,14 @@ struct ggml_tensor * ggml_sub_inplace(
 
 static struct ggml_tensor * ggml_mul_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_MUL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MUL;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5022,25 +4963,14 @@ struct ggml_tensor * ggml_mul_inplace(
 
 static struct ggml_tensor * ggml_div_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_DIV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_DIV;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5065,18 +4995,11 @@ struct ggml_tensor * ggml_div_inplace(
 
 static struct ggml_tensor * ggml_sqr_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SQR;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SQR;
     result->src[0] = a;
 
     return result;
@@ -5098,18 +5021,11 @@ struct ggml_tensor * ggml_sqr_inplace(
 
 static struct ggml_tensor * ggml_sqrt_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SQRT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SQRT;
     result->src[0] = a;
 
     return result;
@@ -5132,17 +5048,10 @@ struct ggml_tensor * ggml_sqrt_inplace(
 static struct ggml_tensor * ggml_log_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_LOG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_LOG;
     result->src[0] = a;
 
     return result;
@@ -5165,17 +5074,10 @@ struct ggml_tensor * ggml_log_inplace(
 static struct ggml_tensor * ggml_sin_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SIN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SIN;
     result->src[0] = a;
 
     return result;
@@ -5198,17 +5100,10 @@ struct ggml_tensor * ggml_sin_inplace(
 static struct ggml_tensor * ggml_cos_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_COS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_COS;
     result->src[0] = a;
 
     return result;
@@ -5230,17 +5125,10 @@ struct ggml_tensor * ggml_cos_inplace(
 
 struct ggml_tensor * ggml_sum(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a) {
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
 
-    result->op   = GGML_OP_SUM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SUM;
     result->src[0] = a;
 
     return result;
@@ -5250,13 +5138,7 @@ struct ggml_tensor * ggml_sum(
 
 struct ggml_tensor * ggml_sum_rows(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a) {
     int64_t ne[GGML_MAX_DIMS] = { 1 };
     for (int i = 1; i < GGML_MAX_DIMS; ++i) {
         ne[i] = a->ne[i];
@@ -5264,8 +5146,7 @@ struct ggml_tensor * ggml_sum_rows(
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
 
-    result->op   = GGML_OP_SUM_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SUM_ROWS;
     result->src[0] = a;
 
     return result;
@@ -5275,19 +5156,11 @@ struct ggml_tensor * ggml_sum_rows(
 
 struct ggml_tensor * ggml_mean(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a) {
     int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op   = GGML_OP_MEAN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MEAN;
     result->src[0] = a;
 
     return result;
@@ -5297,19 +5170,12 @@ struct ggml_tensor * ggml_mean(
 
 struct ggml_tensor * ggml_argmax(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
+        struct ggml_tensor  * a) {
     GGML_ASSERT(ggml_is_matrix(a));
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error");
-        is_node = true;
-    }
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
-    result->op   = GGML_OP_ARGMAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ARGMAX;
     result->src[0] = a;
 
     return result;
@@ -5319,20 +5185,13 @@ struct ggml_tensor * ggml_argmax(
 
 struct ggml_tensor * ggml_repeat(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_can_repeat(a, b));
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
-    result->op   = GGML_OP_REPEAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_REPEAT;
     result->src[0] = a;
 
     return result;
@@ -5342,24 +5201,13 @@ struct ggml_tensor * ggml_repeat(
 
 struct ggml_tensor * ggml_repeat_back(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
-    result->op   = GGML_OP_REPEAT_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_REPEAT_BACK;
     result->src[0] = a;
 
     return result;
@@ -5369,9 +5217,9 @@ struct ggml_tensor * ggml_repeat_back(
 
 struct ggml_tensor * ggml_concat(
     struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
-    int dim) {
+    struct ggml_tensor  * a,
+    struct ggml_tensor  * b,
+    int                   dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
 
     int64_t ne[GGML_MAX_DIMS];
@@ -5384,19 +5232,11 @@ struct ggml_tensor * ggml_concat(
         ne[d] = a->ne[d];
     }
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
 
     ggml_set_op_params_i32(result, 0, dim);
 
-    result->op = GGML_OP_CONCAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CONCAT;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5505,20 +5345,14 @@ struct ggml_tensor * ggml_relu_inplace(
 
 struct ggml_tensor * ggml_leaky_relu(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a, float negative_slope, bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ABORT("fatal error"); // TODO: not implemented
-        is_node = true;
-    }
-
+        struct ggml_tensor  * a,
+        float                 negative_slope,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
-    result->op   = GGML_OP_LEAKY_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_LEAKY_RELU;
     result->src[0] = a;
 
     return result;
@@ -5586,17 +5420,9 @@ struct ggml_tensor * ggml_silu_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SILU_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SILU_BACK;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5604,6 +5430,7 @@ struct ggml_tensor * ggml_silu_back(
 }
 
 // ggml hardswish
+
 struct ggml_tensor * ggml_hardswish(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
@@ -5611,6 +5438,7 @@ struct ggml_tensor * ggml_hardswish(
 }
 
 // ggml hardsigmoid
+
 struct ggml_tensor * ggml_hardsigmoid(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
@@ -5618,6 +5446,7 @@ struct ggml_tensor * ggml_hardsigmoid(
 }
 
 // ggml exp
+
 struct ggml_tensor * ggml_exp(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
@@ -5635,21 +5464,13 @@ struct ggml_tensor * ggml_exp_inplace(
 static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
+        float                 eps,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, &eps, sizeof(eps));
 
-    result->op   = GGML_OP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_NORM;
     result->src[0] = a;
 
     return result;
@@ -5658,14 +5479,14 @@ static struct ggml_tensor * ggml_norm_impl(
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float eps) {
+        float                 eps) {
     return ggml_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_norm_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float eps) {
+        float                 eps) {
     return ggml_norm_impl(ctx, a, eps, true);
 }
 
@@ -5674,20 +5495,13 @@ struct ggml_tensor * ggml_norm_inplace(
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
+        float                 eps,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, &eps, sizeof(eps));
 
-    result->op   = GGML_OP_RMS_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RMS_NORM;
     result->src[0] = a;
 
     return result;
@@ -5696,14 +5510,14 @@ static struct ggml_tensor * ggml_rms_norm_impl(
 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float  eps) {
+        float                 eps) {
     return ggml_rms_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_rms_norm_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        float eps) {
+        float                 eps) {
     return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
@@ -5713,20 +5527,12 @@ struct ggml_tensor * ggml_rms_norm_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
-        float  eps) {
-    bool is_node = false;
-
-    if (a->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
+        float                 eps) {
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, &eps, sizeof(eps));
 
-    result->op   = GGML_OP_RMS_NORM_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RMS_NORM_BACK;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5736,43 +5542,35 @@ struct ggml_tensor * ggml_rms_norm_back(
 // ggml_group_norm
 
 static struct ggml_tensor * ggml_group_norm_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups,
-    float eps,
-    bool inplace) {
-
-    bool is_node = false;
-    if (!inplace && (a->grad)) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps,
+        bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params_i32(result, 0, n_groups);
     ggml_set_op_params_f32(result, 1, eps);
 
-    result->op = GGML_OP_GROUP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_GROUP_NORM;
     result->src[0] = a;
 
     return result;
 }
 
 struct ggml_tensor * ggml_group_norm(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups,
-    float eps) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps) {
     return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct ggml_tensor * ggml_group_norm_inplace(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups,
-    float eps) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps) {
     return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
@@ -5785,17 +5583,10 @@ struct ggml_tensor * ggml_mul_mat(
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op   = GGML_OP_MUL_MAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MUL_MAT;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5841,17 +5632,10 @@ struct ggml_tensor * ggml_mul_mat_id(
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
     GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
-    bool is_node = false;
-
-    if (as->grad || b->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op   = GGML_OP_MUL_MAT_ID;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MUL_MAT_ID;
     result->src[0] = as;
     result->src[1] = b;
     result->src[2] = ids;
@@ -5868,18 +5652,11 @@ struct ggml_tensor * ggml_out_prod(
     GGML_ASSERT(ggml_can_out_prod(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
     const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op   = GGML_OP_OUT_PROD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_OUT_PROD;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5892,21 +5669,14 @@ static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         float                 s,
-        bool inplace) {
+        bool                  inplace) {
     GGML_ASSERT(ggml_is_padded_1d(a));
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, &s, sizeof(s));
 
-    result->op   = GGML_OP_SCALE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SCALE;
     result->src[0] = a;
 
     return result;
@@ -5914,15 +5684,15 @@ static struct ggml_tensor * ggml_scale_impl(
 
 struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
+        struct ggml_tensor  * a,
+        float                 s) {
     return ggml_scale_impl(ctx, a, s, false);
 }
 
 struct ggml_tensor * ggml_scale_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
+        struct ggml_tensor  * a,
+        float                 s) {
     return ggml_scale_impl(ctx, a, s, true);
 }
 
@@ -5936,15 +5706,9 @@ static struct ggml_tensor * ggml_set_impl(
         size_t                nb2,
         size_t                nb3,
         size_t                offset,
-        bool inplace) {
+        bool                  inplace) {
     GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
@@ -5952,8 +5716,7 @@ static struct ggml_tensor * ggml_set_impl(
     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_SET;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SET;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -5962,8 +5725,8 @@ static struct ggml_tensor * ggml_set_impl(
 
 struct ggml_tensor * ggml_set(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                nb1,
         size_t                nb2,
         size_t                nb3,
@@ -5973,8 +5736,8 @@ struct ggml_tensor * ggml_set(
 
 struct ggml_tensor * ggml_set_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                nb1,
         size_t                nb2,
         size_t                nb3,
@@ -5984,24 +5747,24 @@ struct ggml_tensor * ggml_set_inplace(
 
 struct ggml_tensor * ggml_set_1d(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                offset) {
     return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
 }
 
 struct ggml_tensor * ggml_set_1d_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                offset) {
     return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
 }
 
 struct ggml_tensor * ggml_set_2d(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                nb1,
         size_t                offset) {
     return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
@@ -6009,8 +5772,8 @@ struct ggml_tensor * ggml_set_2d(
 
 struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
         size_t                nb1,
         size_t                offset) {
     return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
@@ -6024,13 +5787,6 @@ static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // inplace is false and either one have a grad
-        is_node = true;
-    }
-
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
     if (strlen(b->name) > 0) {
@@ -6039,8 +5795,7 @@ static struct ggml_tensor * ggml_cpy_impl(
         ggml_format_name(result, "%s (copy)", a->name);
     }
 
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CPY;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -6058,15 +5813,11 @@ struct ggml_tensor * ggml_cast(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         enum   ggml_type      type) {
-    bool is_node = false;
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
     ggml_format_name(result, "%s (copy)", a->name);
 
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CPY;
     result->src[0] = a;
-    result->src[1] = result;
 
     return result;
 }
@@ -6076,17 +5827,10 @@ struct ggml_tensor * ggml_cast(
 static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
     ggml_format_name(result, "%s (cont)", a->name);
 
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CONT;
     result->src[0] = a;
 
     return result;
@@ -6132,13 +5876,10 @@ struct ggml_tensor * ggml_cont_4d(
         int64_t               ne3) {
     GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
 
-    bool is_node = false;
-
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
     ggml_format_name(result, "%s (cont)", a->name);
 
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CONT;
     result->src[0] = a;
 
     return result;
@@ -6154,22 +5895,10 @@ struct ggml_tensor * ggml_reshape(
     // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (b->grad) {
-        // gradient propagation is not supported
-        //GGML_ABORT("fatal error");
-    }
-
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RESHAPE;
     result->src[0] = a;
 
     return result;
@@ -6182,18 +5911,11 @@ struct ggml_tensor * ggml_reshape_1d(
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RESHAPE;
     result->src[0] = a;
 
     return result;
@@ -6207,18 +5929,11 @@ struct ggml_tensor * ggml_reshape_2d(
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RESHAPE;
     result->src[0] = a;
 
     return result;
@@ -6233,18 +5948,11 @@ struct ggml_tensor * ggml_reshape_3d(
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RESHAPE;
     result->src[0] = a;
 
     return result;
@@ -6260,18 +5968,11 @@ struct ggml_tensor * ggml_reshape_4d(
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RESHAPE;
     result->src[0] = a;
 
     return result;
@@ -6283,20 +5984,12 @@ static struct ggml_tensor * ggml_view_impl(
         int                   n_dims,
         const int64_t       * ne,
         size_t                offset) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
     ggml_format_name(result, "%s (view)", a->name);
 
     ggml_set_op_params(result, &offset, sizeof(offset));
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_VIEW;
     result->src[0] = a;
 
     return result;
@@ -6309,7 +6002,6 @@ struct ggml_tensor * ggml_view_1d(
         struct ggml_tensor  * a,
         int64_t               ne0,
         size_t                offset) {
-
     struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
 
     return result;
@@ -6324,7 +6016,6 @@ struct ggml_tensor * ggml_view_2d(
         int64_t               ne1,
         size_t                nb1,
         size_t                offset) {
-
     const int64_t ne[2] = { ne0, ne1 };
 
     struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
@@ -6347,7 +6038,6 @@ struct ggml_tensor * ggml_view_3d(
         size_t                nb1,
         size_t                nb2,
         size_t                offset) {
-
     const int64_t ne[3] = { ne0, ne1, ne2 };
 
     struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
@@ -6372,7 +6062,6 @@ struct ggml_tensor * ggml_view_4d(
         size_t                nb2,
         size_t                nb3,
         size_t                offset) {
-
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
@@ -6405,12 +6094,6 @@ struct ggml_tensor * ggml_permute(
     GGML_ASSERT(axis1 != axis3);
     GGML_ASSERT(axis2 != axis3);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
     ggml_format_name(result, "%s (permuted)", a->name);
 
@@ -6437,8 +6120,7 @@ struct ggml_tensor * ggml_permute(
     result->nb[2] = nb[2];
     result->nb[3] = nb[3];
 
-    result->op   = GGML_OP_PERMUTE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_PERMUTE;
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
@@ -6452,12 +6134,6 @@ struct ggml_tensor * ggml_permute(
 struct ggml_tensor * ggml_transpose(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
     ggml_format_name(result, "%s (transposed)", a->name);
 
@@ -6467,8 +6143,7 @@ struct ggml_tensor * ggml_transpose(
     result->nb[0] = a->nb[1];
     result->nb[1] = a->nb[0];
 
-    result->op   = GGML_OP_TRANSPOSE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_TRANSPOSE;
     result->src[0] = a;
 
     return result;
@@ -6484,12 +6159,6 @@ struct ggml_tensor * ggml_get_rows(
     GGML_ASSERT(b->ne[3] == 1);
     GGML_ASSERT(b->type == GGML_TYPE_I32);
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     // TODO: implement non F32 return
     enum ggml_type type = GGML_TYPE_F32;
     if (a->type == GGML_TYPE_I32) {
@@ -6497,8 +6166,7 @@ struct ggml_tensor * ggml_get_rows(
     }
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
 
-    result->op   = GGML_OP_GET_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_GET_ROWS;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -6515,18 +6183,11 @@ struct ggml_tensor * ggml_get_rows_back(
     GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
     GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
     // TODO: implement non F32 return
     //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
     struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
 
-    result->op   = GGML_OP_GET_ROWS_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_GET_ROWS_BACK;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -6539,17 +6200,11 @@ struct ggml_tensor * ggml_diag(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
     GGML_ASSERT(a->ne[1] == 1);
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
 
     const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
 
-    result->op   = GGML_OP_DIAG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_DIAG;
     result->src[0] = a;
 
     return result;
@@ -6562,19 +6217,12 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
         struct ggml_tensor  * a,
         int                   n_past,
         bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_DIAG_MASK_INF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_DIAG_MASK_INF;
     result->src[0] = a;
 
     return result;
@@ -6601,19 +6249,12 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
         struct ggml_tensor  * a,
         int                   n_past,
         bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_DIAG_MASK_ZERO;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_DIAG_MASK_ZERO;
     result->src[0] = a;
 
     return result;
@@ -6656,19 +6297,12 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask);
     }
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_SOFT_MAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SOFT_MAX;
     result->src[0] = a;
     result->src[1] = mask;
 
@@ -6703,16 +6337,9 @@ static struct ggml_tensor * ggml_soft_max_back_impl(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true; // TODO : implement backward pass
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_SOFT_MAX_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SOFT_MAX_BACK;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -6761,12 +6388,6 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
@@ -6778,8 +6399,7 @@ static struct ggml_tensor * ggml_rope_impl(
     memcpy(params + 10, &beta_slow,    sizeof(float));
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_ROPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ROPE;
     result->src[0] = a;
     result->src[1] = b;
     result->src[2] = c;
@@ -6907,13 +6527,6 @@ struct ggml_tensor * ggml_rope_back(
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false && "backwards pass not implemented");
-        is_node = false;
-    }
-
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
@@ -6925,8 +6538,7 @@ struct ggml_tensor * ggml_rope_back(
     memcpy(params + 10, &beta_slow,    sizeof(float));
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_ROPE_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ROPE_BACK;
     result->src[0] = a;
     result->src[1] = b;
     result->src[2] = c;
@@ -6941,21 +6553,13 @@ struct ggml_tensor * ggml_clamp(
         struct ggml_tensor  * a,
         float                 min,
         float                 max) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     // TODO: when implement backward, fix this:
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_CLAMP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CLAMP;
     result->src[0] = a;
 
     return result;
@@ -7017,13 +6621,6 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     GGML_ASSERT(p0 == 0);
     GGML_ASSERT(d0 == 1);
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     const int64_t ne[4] = {
         ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
         a->ne[1], b->ne[2], 1,
@@ -7033,8 +6630,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     int32_t params[] = { s0, p0, d0 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_CONV_TRANSPOSE_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -7042,17 +6638,17 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 }
 
 // ggml_conv_depthwise
-struct ggml_tensor * ggml_conv_depthwise_2d(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1) {
 
+struct ggml_tensor * ggml_conv_depthwise_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
     struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
     struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                         ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
@@ -7072,29 +6668,23 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
 // b: [N, IC, IH, IW]
 // result: [N, OH, OW, IC*KH*KW]
 struct ggml_tensor * ggml_im2col(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1,
-    bool                 is_2D,
-    enum ggml_type       dst_type) {
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D,
+        enum ggml_type        dst_type) {
     if(is_2D) {
         GGML_ASSERT(a->ne[2] == b->ne[2]);
     } else {
         GGML_ASSERT(a->ne[1] == b->ne[1]);
         GGML_ASSERT(b->ne[3] == 1);
     }
-    bool is_node = false;
-
-    if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
-        is_node = true;
-    }
 
     const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
     const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
@@ -7113,8 +6703,7 @@ struct ggml_tensor * ggml_im2col(
     int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_IM2COL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_IM2COL;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -7122,30 +6711,22 @@ struct ggml_tensor * ggml_im2col(
 }
 
 struct ggml_tensor * ggml_im2col_back(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int64_t             * ne,
-    int                   s0,
-    int                   s1,
-    int                   p0,
-    int                   p1,
-    int                   d0,
-    int                   d1,
-    bool                  is_2D) {
-
-    bool is_node = false;
-
-    if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
-        is_node = true;
-    }
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int64_t             * ne,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D) {
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
     int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_IM2COL_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_IM2COL_BACK;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -7159,12 +6740,12 @@ struct ggml_tensor * ggml_conv_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
-        int                  s0,
-        int                  s1,
-        int                  p0,
-        int                  p1,
-        int                  d0,
-        int                  d1) {
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
     struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
 
     struct ggml_tensor * result =
@@ -7180,6 +6761,7 @@ struct ggml_tensor * ggml_conv_2d(
 }
 
 // ggml_conv_2d_sk_p0
+
 struct ggml_tensor * ggml_conv_2d_sk_p0(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -7209,13 +6791,6 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
         int                   stride) {
     GGML_ASSERT(a->ne[3] == b->ne[2]);
 
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     const int64_t ne[4] = {
         ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
         ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
@@ -7226,8 +6801,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
 
     ggml_set_op_params_i32(result, 0, stride);
 
-    result->op = GGML_OP_CONV_TRANSPOSE_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CONV_TRANSPOSE_2D;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -7249,14 +6823,6 @@ struct ggml_tensor * ggml_pool_1d(
         int                   k0,
         int                   s0,
         int                   p0) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     const int64_t ne[4] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
@@ -7268,8 +6834,7 @@ struct ggml_tensor * ggml_pool_1d(
     int32_t params[] = { op, k0, s0, p0 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_POOL_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_POOL_1D;
     result->src[0] = a;
 
     return result;
@@ -7287,13 +6852,6 @@ struct ggml_tensor * ggml_pool_2d(
         int                   s1,
         float                 p0,
         float                 p1) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result;
     const int64_t ne[4] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
@@ -7306,9 +6864,9 @@ struct ggml_tensor * ggml_pool_2d(
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_POOL_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_POOL_2D;
     result->src[0] = a;
+
     return result;
 }
 
@@ -7323,100 +6881,74 @@ struct ggml_tensor * ggml_pool_2d_back(
         int                   s1,
         float                 p0,
         float                 p1) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result;
     result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_POOL_2D_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_POOL_2D_BACK;
     result->src[0] = a;
     result->src[1] = af;
+
     return result;
 }
 
 // ggml_upscale
 
 static struct ggml_tensor * ggml_upscale_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int ne0,
-    int ne1,
-    int ne2,
-    int ne3) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
     GGML_ASSERT(a->ne[0] <= ne0);
     GGML_ASSERT(a->ne[1] <= ne1);
     GGML_ASSERT(a->ne[2] <= ne2);
     GGML_ASSERT(a->ne[3] <= ne3);
 
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            ne0,
-            ne1,
-            ne2,
-            ne3
-            );
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
 
-    result->op = GGML_OP_UPSCALE;
-
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_UPSCALE;
     result->src[0] = a;
 
     return result;
 }
 
 struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   scale_factor) {
     return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
 }
 
 struct ggml_tensor * ggml_upscale_ext(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int ne0,
-    int ne1,
-    int ne2,
-    int ne3) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
     return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
 }
 
 // ggml_pad
 
 struct ggml_tensor * ggml_pad(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    int p0, int p1, int p2, int p3) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1,
+        int                   p2,
+        int                   p3) {
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
             a->ne[0] + p0,
             a->ne[1] + p1,
             a->ne[2] + p2,
             a->ne[3] + p3);
 
-    result->op = GGML_OP_PAD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_PAD;
     result->src[0] = a;
 
     return result;
@@ -7425,39 +6957,32 @@ struct ggml_tensor * ggml_pad(
 // ggml_arange
 
 struct ggml_tensor * ggml_arange(
-    struct ggml_context * ctx,
-    float start,
-    float stop,
-    float step) {
-
+        struct ggml_context * ctx,
+        float                 start,
+        float                 stop,
+        float                 step) {
     GGML_ASSERT(stop > start);
 
     const int64_t steps = (int64_t) ceilf((stop - start) / step);
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
 
-    result->op = GGML_OP_ARANGE;
     ggml_set_op_params_f32(result, 0, start);
     ggml_set_op_params_f32(result, 1, stop);
     ggml_set_op_params_f32(result, 2, step);
 
+    result->op = GGML_OP_ARANGE;
+
     return result;
 }
 
 // ggml_timestep_embedding
 
 struct ggml_tensor * ggml_timestep_embedding(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * timesteps,
-            int                   dim,
-            int                   max_period) {
-    bool is_node = false;
-
-    if (timesteps->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
+        struct ggml_context * ctx,
+        struct ggml_tensor  * timesteps,
+        int                   dim,
+        int                   max_period) {
     int actual_dim = dim;
     if (dim % 2 != 0) {
         actual_dim = dim + 1;
@@ -7465,11 +6990,10 @@ struct ggml_tensor * ggml_timestep_embedding(
 
     struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
 
-    result->op = GGML_OP_TIMESTEP_EMBEDDING;
     ggml_set_op_params_i32(result, 0, dim);
     ggml_set_op_params_i32(result, 1, max_period);
 
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_TIMESTEP_EMBEDDING;
     result->src[0] = timesteps;
 
     return result;
@@ -7478,22 +7002,14 @@ struct ggml_tensor * ggml_timestep_embedding(
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_sort_order  order) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: not implemented
-        is_node = true;
-    }
-
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        enum ggml_sort_order   order) {
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
 
-    result->op   = GGML_OP_ARGSORT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ARGSORT;
     result->src[0] = a;
 
     return result;
@@ -7546,10 +7062,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
 
     bool is_node = false;
 
-    if (q->grad || k->grad || v->grad) {
-        is_node = true;
-    }
-
     // permute(0, 2, 1, 3)
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
@@ -7676,17 +7188,9 @@ struct ggml_tensor * ggml_ssm_conv(
     GGML_ASSERT(sx->ne[1] == d_inner);
     GGML_ASSERT(n_t >= 0);
 
-    bool is_node = false;
-
-    if (sx->grad || c->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement
-        is_node = true;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
 
-    result->op   = GGML_OP_SSM_CONV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_SSM_CONV;
     result->src[0] = sx;
     result->src[1] = c;
 
@@ -7730,18 +7234,10 @@ struct ggml_tensor * ggml_ssm_scan(
         GGML_ASSERT(B->ne[2] == n_seqs);
     }
 
-    bool is_node = false;
-
-    if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement
-        is_node = true;
-    }
-
     // concatenated y + ssm_states
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
 
     result->op   = GGML_OP_SSM_SCAN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = s;
     result->src[1] = x;
     result->src[2] = dt;
@@ -7761,13 +7257,6 @@ struct ggml_tensor * ggml_win_part(
     GGML_ASSERT(a->ne[3] == 1);
     GGML_ASSERT(a->type  == GGML_TYPE_F32);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     // padding
     const int px = (w - a->ne[1]%w)%w;
     const int py = (w - a->ne[2]%w)%w;
@@ -7782,8 +7271,7 @@ struct ggml_tensor * ggml_win_part(
     int32_t params[] = { npx, npy, w };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_WIN_PART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_WIN_PART;
     result->src[0] = a;
 
     return result;
@@ -7799,21 +7287,13 @@ struct ggml_tensor * ggml_win_unpart(
         int                   w) {
     GGML_ASSERT(a->type == GGML_TYPE_F32);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_WIN_UNPART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_WIN_UNPART;
     result->src[0] = a;
 
     return result;
@@ -7829,18 +7309,10 @@ struct ggml_tensor * ggml_get_rel_pos(
     GGML_ASSERT(qh == kh);
     GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
 
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
 
-    result->op   = GGML_OP_GET_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_GET_REL_POS;
     result->src[0] = a;
 
     return result;
@@ -7864,17 +7336,10 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
     GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
     GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || pw->grad || ph->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
 
-    result->op   = GGML_OP_ADD_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_ADD_REL_POS;
     result->src[0] = a;
     result->src[1] = pw;
     result->src[2] = ph;
@@ -7902,12 +7367,12 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
 
 struct ggml_tensor * ggml_rwkv_wkv(
         struct ggml_context * ctx,
-        struct ggml_tensor * k,
-        struct ggml_tensor * v,
-        struct ggml_tensor * r,
-        struct ggml_tensor * tf,
-        struct ggml_tensor * td,
-        struct ggml_tensor * state) {
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * r,
+        struct ggml_tensor  * tf,
+        struct ggml_tensor  * td,
+        struct ggml_tensor  * state) {
     GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
     GGML_ASSERT(ggml_is_contiguous(r));
@@ -7928,19 +7393,11 @@ struct ggml_tensor * ggml_rwkv_wkv(
         GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
     }
 
-    bool is_node = false;
-
-    if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
-        is_node = true;
-    }
-
     // concat output and new_state
     const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op   = GGML_OP_RWKV_WKV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_RWKV_WKV;
     result->src[0] = k;
     result->src[1] = v;
     result->src[2] = r;
@@ -7955,23 +7412,16 @@ struct ggml_tensor * ggml_rwkv_wkv(
 
 static struct ggml_tensor * ggml_unary_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        enum ggml_unary_op op,
-        bool inplace) {
+        struct ggml_tensor  * a,
+        enum ggml_unary_op    op,
+        bool                  inplace) {
     GGML_ASSERT(ggml_is_contiguous_1(a));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params_i32(result, 0, (int32_t) op);
 
-    result->op   = GGML_OP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_UNARY;
     result->src[0] = a;
 
     return result;
@@ -7980,14 +7430,14 @@ static struct ggml_tensor * ggml_unary_impl(
 struct ggml_tensor * ggml_unary(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
+        enum ggml_unary_op    op) {
     return ggml_unary_impl(ctx, a, op, false);
 }
 
 struct ggml_tensor * ggml_unary_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
+        enum ggml_unary_op    op) {
     return ggml_unary_impl(ctx, a, op, true);
 }
 
@@ -7996,20 +7446,13 @@ struct ggml_tensor * ggml_unary_inplace(
 static struct ggml_tensor * ggml_map_unary_impl_f32(
         struct ggml_context        * ctx,
         struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
+        const  ggml_unary_op_f32_t   fun,
+        bool                         inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_UNARY;
     result->src[0] = a;
 
     return result;
@@ -8018,14 +7461,14 @@ static struct ggml_tensor * ggml_map_unary_impl_f32(
 struct ggml_tensor * ggml_map_unary_f32(
         struct ggml_context        * ctx,
         struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
+        const  ggml_unary_op_f32_t   fun) {
     return ggml_map_unary_impl_f32(ctx, a, fun, false);
 }
 
 struct ggml_tensor * ggml_map_unary_inplace_f32(
         struct ggml_context        * ctx,
         struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
+        const  ggml_unary_op_f32_t   fun) {
     return ggml_map_unary_impl_f32(ctx, a, fun, true);
 }
 
@@ -8035,22 +7478,15 @@ static struct ggml_tensor * ggml_map_binary_impl_f32(
         struct ggml_context         * ctx,
         struct ggml_tensor          * a,
         struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun,
-        bool   inplace) {
+        const  ggml_binary_op_f32_t   fun,
+        bool                          inplace) {
     GGML_ASSERT(ggml_are_same_shape(a, b));
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_BINARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_BINARY;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -8061,7 +7497,7 @@ struct ggml_tensor * ggml_map_binary_f32(
         struct ggml_context         * ctx,
         struct ggml_tensor          * a,
         struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
+        const  ggml_binary_op_f32_t   fun) {
     return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
 }
 
@@ -8069,7 +7505,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
         struct ggml_context         * ctx,
         struct ggml_tensor          * a,
         struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
+        const  ggml_binary_op_f32_t   fun) {
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
@@ -8079,19 +7515,12 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context          * ctx,
         struct ggml_tensor           * a,
         const  ggml_custom1_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
+        bool                           inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM1_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM1_F32;
     result->src[0] = a;
 
     return result;
@@ -8118,19 +7547,12 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_tensor           * a,
         struct ggml_tensor           * b,
         const  ggml_custom2_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
+        bool                           inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM2_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM2_F32;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -8161,19 +7583,12 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_tensor           * b,
         struct ggml_tensor           * c,
         const  ggml_custom3_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
+        bool                           inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM3_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM3_F32;
     result->src[0] = a;
     result->src[1] = b;
     result->src[2] = c;
@@ -8201,26 +7616,20 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
 
 // ggml_map_custom1
 struct ggml_map_custom1_op_params {
-    ggml_custom1_op_t fun;
-    int n_tasks;
-    void * userdata;
+    ggml_custom1_op_t  fun;
+    int                n_tasks;
+    void             * userdata;
 };
 
 static struct ggml_tensor * ggml_map_custom1_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
     GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
 
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     struct ggml_map_custom1_op_params params = {
@@ -8230,55 +7639,48 @@ static struct ggml_tensor * ggml_map_custom1_impl(
     };
     ggml_set_op_params(result, (const void *) &params, sizeof(params));
 
-    result->op = GGML_OP_MAP_CUSTOM1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM1;
     result->src[0] = a;
 
     return result;
 }
 
 struct ggml_tensor * ggml_map_custom1(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
 }
 
 struct ggml_tensor * ggml_map_custom1_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
 }
 
 // ggml_map_custom2
 
 struct ggml_map_custom2_op_params {
-    ggml_custom2_op_t fun;
-    int n_tasks;
-    void * userdata;
+    ggml_custom2_op_t   fun;
+    int                 n_tasks;
+    void              * userdata;
 };
 
 static struct ggml_tensor * ggml_map_custom2_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
     GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     struct ggml_map_custom2_op_params params = {
@@ -8288,8 +7690,7 @@ static struct ggml_tensor * ggml_map_custom2_impl(
     };
     ggml_set_op_params(result, (const void *) &params, sizeof(params));
 
-    result->op = GGML_OP_MAP_CUSTOM2;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM2;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -8297,22 +7698,22 @@ static struct ggml_tensor * ggml_map_custom2_impl(
 }
 
 struct ggml_tensor * ggml_map_custom2(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
 }
 
 struct ggml_tensor * ggml_map_custom2_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
 }
 
@@ -8325,22 +7726,16 @@ struct ggml_map_custom3_op_params {
 };
 
 static struct ggml_tensor * ggml_map_custom3_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
     GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
 
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     struct ggml_map_custom3_op_params params = {
@@ -8350,8 +7745,7 @@ static struct ggml_tensor * ggml_map_custom3_impl(
     };
     ggml_set_op_params(result, (const void *) &params, sizeof(params));
 
-    result->op = GGML_OP_MAP_CUSTOM3;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_MAP_CUSTOM3;
     result->src[0] = a;
     result->src[1] = b;
     result->src[2] = c;
@@ -8360,44 +7754,38 @@ static struct ggml_tensor * ggml_map_custom3_impl(
 }
 
 struct ggml_tensor * ggml_map_custom3(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
 }
 
 struct ggml_tensor * ggml_map_custom3_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
     return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
 }
 
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_are_same_shape(a, b));
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
 
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_CROSS_ENTROPY_LOSS;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -8407,17 +7795,16 @@ struct ggml_tensor * ggml_cross_entropy_loss(
 // ggml_cross_entropy_loss_back
 
 struct ggml_tensor * ggml_cross_entropy_loss_back(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        struct ggml_tensor          * c) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c) {
     GGML_ASSERT(ggml_are_same_shape(a, b));
     GGML_ASSERT(ggml_is_scalar(c));
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
-    result->grad = NULL;
+    result->op     = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
     result->src[0] = a;
     result->src[1] = b;
     result->src[2] = c;
@@ -8435,7 +7822,7 @@ struct ggml_tensor * ggml_opt_step_adamw(
         float                 beta2,
         float                 eps,
         float                 wd) {
-    GGML_ASSERT(a->grad);
+    GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
     GGML_ASSERT(alpha >  0.0f);
     GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f);
     GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f);
@@ -8444,13 +7831,6 @@ struct ggml_tensor * ggml_opt_step_adamw(
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
-    result->op   = GGML_OP_OPT_STEP_ADAMW;
-    result->grad = NULL;
-    result->src[0] = a;
-    result->src[1] = a->grad;
-    result->src[2] = ggml_dup_tensor(ctx, a->grad);
-    result->src[3] = ggml_dup_tensor(ctx, a->grad);
-
     const int64_t iter = 1;
     memcpy(&result->op_params[0], &iter, sizeof(int64_t));
     ggml_set_op_params_f32(result, 2, alpha);
@@ -8459,26 +7839,17 @@ struct ggml_tensor * ggml_opt_step_adamw(
     ggml_set_op_params_f32(result, 5, eps);
     ggml_set_op_params_f32(result, 6, wd);
 
+    result->op     = GGML_OP_OPT_STEP_ADAMW;
+    result->src[0] = a;
+    result->src[1] = a->grad;
+    result->src[2] = ggml_dup_tensor(ctx, a);
+    result->src[3] = ggml_dup_tensor(ctx, a);
+
     return result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
-
-    GGML_ASSERT(tensor->grad == NULL);
-    tensor->grad = ggml_dup_tensor(ctx, tensor);
-    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
-}
-
-void ggml_set_loss(struct ggml_tensor * tensor) {
-    GGML_ASSERT(ggml_is_scalar(tensor));
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
-    GGML_ASSERT(tensor->grad);
-    tensor->flags |= GGML_TENSOR_FLAG_LOSS;
-}
-
 // ggml_compute_forward_dup
 
 static void ggml_compute_forward_dup_same_cont(
@@ -18198,7 +17569,7 @@ void ggml_build_backward_gradient_checkpointing(
         struct ggml_tensor  * * checkpoints,
         int                     n_checkpoints) {
     ggml_graph_cpy(gf, gb_tmp);
-    ggml_build_backward_expand(ctx, gf, gb_tmp, false, true);
+    ggml_build_backward_expand(ctx, gf, gb_tmp, false);
 
     if (n_checkpoints <= 0) {
         ggml_graph_cpy(gb_tmp, gb);
@@ -18850,7 +18221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_soft_max_back(ctx, tensor->grad, tensor),
                         zero_table, acc_table);
                 }
-
+                GGML_ASSERT((!src1 || !src1->grad) && "backward pass for softmax mask not implemented");
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -18891,6 +18262,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 beta_slow),
                             zero_table, acc_table);
                 }
+                GGML_ASSERT((!src2 || !src2->grad) && "gradients for freq factors not implemented");
             } break;
         case GGML_OP_ROPE_BACK:
             {
@@ -19012,6 +18384,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             }
         case GGML_OP_FLASH_ATTN_EXT:
             {
+                GGML_ABORT("FA backward pass not adapted after rework");
                 struct ggml_tensor * flash_grad = NULL;
                 if (src0->grad || src1->grad || tensor->src[2]->grad) {
                     int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -19186,6 +18559,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                     tensor->grad),
                                 zero_table, acc_table);
                 }
+                GGML_ASSERT(!src1->grad && "backward pass for labels not implemented");
             } break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
@@ -19236,7 +18610,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         }
     }
 
-    if (node->op == GGML_OP_NONE && node->grad == NULL) {
+    if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < cgraph->size);
 
@@ -19254,9 +18628,6 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
-        if (cgraph->grads) {
-            cgraph->grads[cgraph->n_nodes] = node->grad;
-        }
         cgraph->n_nodes++;
     }
 }
@@ -19284,20 +18655,58 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep) {
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate) {
     GGML_ASSERT(gf->n_nodes > 0);
     GGML_ASSERT(gf->grads);
 
-    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
-    if (keep) {
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
+    for (int i = 0; i < gf->n_nodes; ++i) {
+        struct ggml_tensor * node = gf->nodes[i];
 
-            if (node->grad) {
-                node->grad = ggml_dup_tensor(ctx, node);
-                gf->grads[i] = node->grad;
-            }
+        bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool ignore_src[GGML_MAX_SRC] = {false};
+        switch (node->op) {
+            // gradients in node->src[0] for one reason or another have no effect on output gradients
+            case GGML_OP_IM2COL:      // only used for its shape
+            case GGML_OP_IM2COL_BACK: // same as IM2COL
+                ignore_src[0] = true;
+                break;
+            case GGML_OP_UNARY: {
+                const enum ggml_unary_op uop = ggml_get_unary_op(node);
+                // SGN and STEP unary ops are piecewise constant
+                if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) {
+                    ignore_src[0] = true;
+                }
+            } break;
+
+            // gradients in node->src[1] for one reason or another have no effect on output gradients
+            case GGML_OP_CPY:           // gradients in CPY target  are irrelevant
+            case GGML_OP_GET_ROWS:      // row indices not differentiable
+            case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
+            case GGML_OP_ROPE:          // positions not differentiable
+                ignore_src[1] = true;
+                break;
+
+            default:
+                break;
         }
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (!node->src[j] || !node->src[j]->grad || ignore_src[j]) {
+                continue;
+            }
+            GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
+            needs_grad = true;
+            break;
+        }
+        if (!needs_grad) {
+            continue;
+        }
+
+        // inplace operations are currently not supported
+        GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
+
+        // create a new tensor with the same type and shape as the node and set it as grad
+        node->grad = ggml_dup_tensor(ctx, node);
     }
 
     // keep tables of original gradients for replacement/accumulation logic
@@ -22162,8 +21571,6 @@ enum ggml_opt_result ggml_opt(
         struct ggml_context * ctx,
         struct ggml_opt_params params,
         struct ggml_tensor * f) {
-    GGML_ASSERT(f->grad && "ggml_set_param called for at least one parent tensor.");
-
     bool free_ctx = false;
     if (ctx == NULL) {
         struct ggml_init_params params_ctx = {
@@ -22204,7 +21611,7 @@ enum ggml_opt_result ggml_opt_resume(
     ggml_build_forward_expand(gf, f);
 
     struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
-    ggml_build_backward_expand(ctx, gf, gb, false, true);
+    ggml_build_backward_expand(ctx, gf, gb, false);
 
     return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
 }
@@ -22257,6 +21664,17 @@ void ggml_set_output(struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
 }
 
+void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    GGML_UNUSED(ctx); // TODO: remove this parameter
+    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
+}
+
+void ggml_set_loss(struct ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_is_scalar(tensor));
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
+    tensor->flags |= GGML_TENSOR_FLAG_LOSS;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_quantize_init(enum ggml_type type) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d2cfe06b5..5c78b6704 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,6 +1,6 @@
 // This file defines tests for various GGML ops and backends.
 // For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
-// For the backwards pass it asserts that the gradients from backpropagation are consistent
+// For the backward pass it asserts that the gradients from backpropagation are consistent
 // with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
 // It is also possible to check the performance ("perf" mode).
 //
@@ -740,7 +740,7 @@ struct test_case {
 
         ggml_tensor * out = build_graph(ctx);
 
-        if (op_name != nullptr && op_desc(out) != op_name) {
+        if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
             //printf("  %s: skipping\n", op_desc(out).c_str());
             ggml_free(ctx);
             return true;
@@ -749,11 +749,6 @@ struct test_case {
         printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
         fflush(stdout);
 
-        if (out->grad == nullptr) {
-            printf("backwards pass not supported \n");
-            ggml_free(ctx);
-            return true;
-        }
         if (out->type != GGML_TYPE_F32) {
             ggml_free(ctx);
             printf("not supported [%s->type != FP32]\n", out->name);
@@ -762,18 +757,26 @@ struct test_case {
 
         // check if the backend supports the ops
         bool supported = true;
+        bool any_params = false;
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (!ggml_backend_supports_op(backend, t)) {
                 printf("not supported [%s] ", ggml_backend_name(backend));
                 supported = false;
                 break;
             }
-            if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
-                printf("not supported [%s->type != FP32] ", t->name);
-                supported = false;
-                break;
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                any_params = true;
+                if (t->type != GGML_TYPE_F32) {
+                    printf("not supported [%s->type != FP32] ", t->name);
+                    supported = false;
+                    break;
+                }
             }
         }
+        if (!any_params) {
+            printf("not supported [%s] \n", op_name);
+            supported = false;
+        }
         if (!supported) {
             printf("\n");
             ggml_free(ctx);
@@ -801,7 +804,7 @@ struct test_case {
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false, false);
+        ggml_build_backward_expand(ctx, gf, gb, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -984,7 +987,7 @@ struct test_example : public test_case {
     }
     // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
     // immediately after you create the tensors.
-    // This is optional and only makes sense if a backwards pass has actually been implemented for the new op.
+    // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
 };
 
 
@@ -1223,7 +1226,7 @@ struct test_set : public test_case {
             offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
         }
         ggml_tensor * out = ggml_set(ctx, dst, src,
-            // The backwards pass requires setting a contiguous region:
+            // The backward pass requires setting a contiguous region:
             src->nb[1], src->nb[2], src->nb[3], offset);
         ggml_set_name(out, "out");
 
@@ -1335,7 +1338,7 @@ struct test_bin_bcast : public test_case {
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(b, "b");
 
-        // The backwards pass supports broadcasting only for GGML_ADD:
+        // The backward pass supports broadcasting only for GGML_ADD:
         const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
         if (grad_supported) {
             ggml_set_param(ctx, a);
@@ -1830,7 +1833,7 @@ struct test_log : public test_case {
 
     void initialize_tensors(ggml_context * ctx) override {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            // log(1) == 0, cluster values there to keep the sum low for better precision in the backwards pass:
+            // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
             init_tensor_uniform(t, 0.9f, 1.1f);
         }
     }
@@ -3257,7 +3260,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
+    for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 2ef606d2c..2200ad93d 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -240,12 +240,14 @@ static bool check_gradient(
     struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
     ggml_build_forward_expand(gf, f);
     ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false, false);
+    ggml_build_backward_expand(ctx0, gf, gb, false);
 
     ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
+    ggml_graph_reset(gb);
+    if (f->grad) {
+        ggml_set_f32(f->grad, 1.0f);
+    }
 
     ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
@@ -298,8 +300,10 @@ static bool check_gradient(
             ggml_set_f32_1d(x[i], k, x0);
 
             // compute gradient using backward graph
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
+            ggml_graph_reset(gb);
+            if (f->grad) {
+                ggml_set_f32(f->grad, 1.0f);
+            }
 
             ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 

From 6c5322481a75f1be54e58a000c3f78484d07f948 Mon Sep 17 00:00:00 2001
From: Borislav Stanimirov <b.stanimirov@abv.bg>
Date: Mon, 30 Sep 2024 10:11:41 +0300
Subject: [PATCH 75/78] ggml : fix ggml_cast (ggml/973)

---
 ggml/src/ggml.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index fa8d6c25a..aac4e3a7b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5818,6 +5818,7 @@ struct ggml_tensor * ggml_cast(
 
     result->op     = GGML_OP_CPY;
     result->src[0] = a;
+    result->src[1] = result;
 
     return result;
 }

From cb00020504416606601f8cb35f55ee07b710b4b7 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Mon, 30 Sep 2024 09:14:09 +0200
Subject: [PATCH 76/78] vulkan : mul_mat: fix UB with small warps (ggml/952)

When the device's warp size is less than 16,
it is possible for loadstride_a (mul_mm.comp:114)
and loadstride_b (mul_mm.comp:115) to be set to 0.
Because they are calculated as: the workgroup size,
multiplied by LOAD_VEC_* (which can be 1) and divided by 16.
And the workgroup size is set to be the same as the
warp/subgroup size.

The loadstride_* variables are used as increments in the
loops that populate the buffers used for the multiplication.

When they are 0 they cause an infinite loop.
But infinite loops without side-effects are UB and the
values of loadstride_* are known at compile time.
So, the compiler quietly optimizes all the loops away.
As a consequence, the buffers are not populated and
the multiplication result is just a matrix with all elements
set to 0.

We prevent the UB by making sure that the workgroup size
will never be less than 16, even if our device has a
smaller warp size (e.g. 8).

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
---
 ggml/src/ggml-vulkan.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index c677a2728..00ad13bb9 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1164,11 +1164,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_s = { device->subgroup_size,  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_s = { std::max(device->subgroup_size, 16u),  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
 
     std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_mmq_m = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size,  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_mmq_s = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
     std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
     std::array<uint32_t, 3> m_wg_denoms = { 64,  64, 1 };

From e98c1c188ebbeb55d0cd6389ad03f0cbf995b451 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 30 Sep 2024 09:55:23 +0200
Subject: [PATCH 77/78] test: fix OPT_STEP_ADAMW for test-backend-ops
 (ggml/974)

---
 ggml/include/ggml.h        |  1 +
 ggml/src/ggml.c            | 10 ++++++----
 tests/test-backend-ops.cpp |  5 ++++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a8a74bee1..ce3d92cb2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2052,6 +2052,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_opt_step_adamw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
+            struct ggml_tensor  * grad,
             float                 alpha,
             float                 beta1,
             float                 beta2,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index aac4e3a7b..bcbc32d91 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7818,12 +7818,14 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
 struct ggml_tensor * ggml_opt_step_adamw(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        struct ggml_tensor  * grad,
         float                 alpha,
         float                 beta1,
         float                 beta2,
         float                 eps,
         float                 wd) {
     GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
+    GGML_ASSERT(ggml_are_same_shape(a, grad));
     GGML_ASSERT(alpha >  0.0f);
     GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f);
     GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f);
@@ -7842,9 +7844,9 @@ struct ggml_tensor * ggml_opt_step_adamw(
 
     result->op     = GGML_OP_OPT_STEP_ADAMW;
     result->src[0] = a;
-    result->src[1] = a->grad;
-    result->src[2] = ggml_dup_tensor(ctx, a);
-    result->src[3] = ggml_dup_tensor(ctx, a);
+    result->src[1] = grad;
+    result->src[2] = ggml_dup_tensor(ctx, grad);
+    result->src[3] = ggml_dup_tensor(ctx, grad);
 
     return result;
 }
@@ -18769,7 +18771,7 @@ void ggml_build_opt_adamw(
 
         if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, alpha, beta1, beta2, eps, wd);
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, node->grad, alpha, beta1, beta2, eps, wd);
             ggml_build_forward_expand(gb, opt_step);
         }
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5c78b6704..95d983aa0 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2751,7 +2751,10 @@ struct test_opt_step_adamw : public test_case {
         ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
+        ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
         ggml_set_name(out, "out");
 
         return out;

From f1b8c4271125c2bfb9cfebd72a8b9e9f99061a30 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 1 Oct 2024 16:09:42 +0300
Subject: [PATCH 78/78] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index aa301462a..23c24899e 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-9a24b8c8c40eab7262d067e91d08df160678df8d
+4de6ee8e6a4b2145d6b92162bc87722fecb4ea46