From 266b8519ee6d21e7ba2bf56f5629e20a181fee8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Fri, 29 Nov 2024 09:49:43 +0000 Subject: [PATCH 01/24] sycl : Reroute permuted mul_mats through oneMKL (#10408) This PR fixes the failing MUL_MAT tests for the sycl backend. --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b6392ed8d..aabcdc224 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3447,8 +3447,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - // KQ single-batch - ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + // TODO: Refactor and cleanup of mul mat dispatching. + if (src0->ne[3] == 1 && src1->ne[3] == 1) { + // KQ single-batch + // mmv p021 was specific for these dimensions + ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + } else { + // The kernel from the if path is faster for that specific case, but does not support all mul mats. + ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + } } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst); From 0f77aae5608f16780a49926b67be6d56ec4b09bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Fri, 29 Nov 2024 12:38:45 +0000 Subject: [PATCH 02/24] sycl : offload of get_rows set to 0 (#10432) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index aabcdc224..808f74fa0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4493,7 +4493,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_ static int64_t get_op_batch_size(const ggml_tensor * op) { switch (op->op) { case GGML_OP_GET_ROWS: - return op->ne[1]; // this will increse the speed of prefill in test + return 0; case GGML_OP_MUL_MAT: return op->ne[1]; case GGML_OP_MUL_MAT_ID: From 4b3242bbea172ac0980378496fbc676d44c4f459 Mon Sep 17 00:00:00 2001 From: Shupei Fan Date: Fri, 29 Nov 2024 21:49:02 +0800 Subject: [PATCH 03/24] ggml-cpu: fix typo in gemv/gemm iq4_nl_4_4 (#10580) --- ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 69d3d327d..14a1f00eb 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -1020,7 +1020,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void float * res_ptr = s; for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); float32x4_t sumf = vdupq_n_f32(0); for (int l = 0; l < nb; l++) { @@ -3507,7 +3507,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); for (int x = 0; x < nc / 
ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); float32x4_t sumf[4]; for (int m = 0; m < 4; m++) { From f0678c5ff4cb8873d6ff48801475ff270db656fa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Nov 2024 16:25:39 +0200 Subject: [PATCH 04/24] ggml : fix I8MM Q4_1 scaling factor conversion (#10562) ggml-ci --- ggml/src/ggml-cpu/ggml-cpu-quants.c | 57 +++++++++++++++++------------ ggml/src/ggml-cpu/ggml-cpu.c | 4 +- tests/test-backend-ops.cpp | 2 + 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 11e8df253..634c5fa11 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; - + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -1811,7 +1812,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); @@ -2347,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const block_q8_1 * restrict b_y0 = &vy0[i]; const block_q8_1 * restrict b_y1 = &vy1[i]; - float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)}; + float32_t summs_t[4] = { + GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), + GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) + }; summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2371,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); // mmla into int32x4_t - float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d, - GGML_FP16_TO_FP32(b_x0->d)*b_y1->d, - GGML_FP16_TO_FP32(b_x1->d)*b_y0->d, - GGML_FP16_TO_FP32(b_x1->d)*b_y1->d}; + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + 
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -2389,15 +2394,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } - float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + sumv2 = vaddq_f32(sumv2, summs0); vst1_f32(s, vget_low_f32 (sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); + return; } #endif @@ -3374,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -3393,13 +3402,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } - float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s, vget_low_f32 (sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); + return; } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1c88e5d81..e0cefc20b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7641,8 +7641,8 @@ UseGgmlGemm2:; // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. 
- // this check can be removed once they are extended to support odd numbered rows/cols too + // these checks are needed to avoid crossing dim1 boundaries + // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { num_rows_per_vec_dot = 1; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index da66ed856..4d9df1a64 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3334,7 +3334,9 @@ static const ggml_type all_types[] = { static const ggml_type base_types[] = { GGML_TYPE_F32, GGML_TYPE_F16, + GGML_TYPE_Q8_0, // for I8MM tests GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, // for I8MM tests GGML_TYPE_Q4_K, GGML_TYPE_IQ2_XXS }; From a3a3048e7a0f9464d0d625a29257d8bce5da5090 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 29 Nov 2024 17:45:08 +0100 Subject: [PATCH 05/24] cleanup UI link list (#10577) * cleanup UI link list * sort list alphabetically * add missing licenses --- README.md | 58 ++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 414c5b1c0..8aa3d1e59 100644 --- a/README.md +++ b/README.md @@ -140,44 +140,36 @@ Typically finetunes of the base models below are supported as well. **UI:** -Unless otherwise noted these projects are open-source with permissive licensing: - -- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) +- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) +- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) - [janhq/jan](https://github.com/janhq/jan) (AGPL) -- [nat/openplayground](https://github.com/nat/openplayground) -- [Faraday](https://faraday.dev/) (proprietary) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) +- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [LARS](https://github.com/abgulati/LARS) (AGPL) +- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LMStudio](https://lmstudio.ai/) (proprietary) -- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) -- [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) -- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) -- [ollama/ollama](https://github.com/ollama/ollama) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) -- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) -- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) -- [RAGNA Desktop](https://ragna.app/) (proprietary) -- [RecurseChat](https://recurse.chat/) (proprietary) -- [semperai/amica](https://github.com/semperai/amica) -- 
[withcatai/catai](https://github.com/withcatai/catai) -- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) -- [Msty](https://msty.app) (proprietary) -- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) -- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) -- [Dot](https://github.com/alexpinel/Dot) (GPL) - [MindMac](https://mindmac.app) (proprietary) -- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) -- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) -- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) -- [AIKit](https://github.com/sozercan/aikit) (MIT) -- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) -- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) -- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0) +- [nat/openplayground](https://github.com/nat/openplayground) (MIT) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT) +- [ollama/ollama](https://github.com/ollama/ollama) (MIT) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) +- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [ramalama](https://github.com/containers/ramalama) (MIT) +- [semperai/amica](https://github.com/semperai/amica) (MIT) +- [withcatai/catai](https://github.com/withcatai/catai) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* From 3a8e9af402f7893423bdab444aa16c5d9a2d429a Mon Sep 17 00:00:00 2001 From: Robert Collins Date: Fri, 29 Nov 2024 12:21:37 -0500 Subject: [PATCH 06/24] imatrix : support combine-only (#10492) * imatrix-combine-only idea * ensured that behavior consistent with log --- examples/imatrix/imatrix.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 70ff47768..45206f4a7 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,10 +637,19 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - if (!compute_imatrix(ctx, params)) { - return 1; + if (params.prompt.empty()) { + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + } else { + if (!compute_imatrix(ctx, params)) { + return 1; + } } + g_collector.save_imatrix(); LOG("\n"); From b782e5c7d453b3f1fa8dc6c34cde7e2fa946af93 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 29 Nov 2024 21:48:56 +0100 Subject: [PATCH 07/24] server : add more test cases (#10569) * server : add split model test * add test speculative * add invalid cases --- examples/server/tests/unit/test_basic.py | 14 +++ 
.../server/tests/unit/test_chat_completion.py | 19 ++++ examples/server/tests/unit/test_infill.py | 22 ++++ examples/server/tests/unit/test_rerank.py | 17 +++ .../server/tests/unit/test_speculative.py | 103 ++++++++++++++++++ examples/server/tests/utils.py | 12 +- 6 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 examples/server/tests/unit/test_speculative.py diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index 84db5ca1c..d82d54a5a 100644 --- a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -32,3 +32,17 @@ def test_server_models(): assert res.status_code == 200 assert len(res.body["data"]) == 1 assert res.body["data"][0]["id"] == server.model_alias + +def test_load_split_model(): + global server + server.model_hf_repo = "ggml-org/models" + server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf" + server.model_alias = "tinyllama-split" + server.start() + res = server.make_request("POST", "/completion", data={ + "n_predict": 16, + "prompt": "Hello", + "temperature": 0.0, + }) + assert res.status_code == 200 + assert match_regex("(little|girl)+", res.body["content"]) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index d7aeb288d..1048d6fca 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -127,3 +127,22 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int assert res.status_code != 200 assert "error" in res.body + +@pytest.mark.parametrize("messages", [ + None, + "string", + [123], + [{}], + [{"role": 123}], + [{"role": "system", "content": 123}], + # [{"content": "hello"}], # TODO: should not be a valid case + [{"role": "system", "content": "test"}, {}], +]) +def test_invalid_chat_completion_req(messages): + global server + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "messages": messages, + }) + assert res.status_code == 400 or res.status_code == 500 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index 38ce6c429..6a6d40a1c 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -8,6 +8,7 @@ def create_server(): global server server = ServerPreset.tinyllama_infill() + def test_infill_without_input_extra(): global server server.start() @@ -19,6 +20,7 @@ def test_infill_without_input_extra(): assert res.status_code == 200 assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) + def test_infill_with_input_extra(): global server server.start() @@ -33,3 +35,23 @@ def test_infill_with_input_extra(): }) assert res.status_code == 200 assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) + + +@pytest.mark.parametrize("input_extra", [ + {}, + {"filename": "ok"}, + {"filename": 123}, + {"filename": 123, "text": "abc"}, + {"filename": 123, "text": 456}, +]) +def test_invalid_input_extra_req(input_extra): + global server + server.start() + res = server.make_request("POST", "/infill", data={ + "prompt": "Complete this", + "input_extra": [input_extra], + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_suffix": "}\n", + }) + assert res.status_code == 400 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_rerank.py 
b/examples/server/tests/unit/test_rerank.py index 3a49fd3ac..189bc4c96 100644 --- a/examples/server/tests/unit/test_rerank.py +++ b/examples/server/tests/unit/test_rerank.py @@ -36,3 +36,20 @@ def test_rerank(): assert most_relevant["relevance_score"] > least_relevant["relevance_score"] assert most_relevant["index"] == 2 assert least_relevant["index"] == 3 + + +@pytest.mark.parametrize("documents", [ + [], + None, + 123, + [1, 2, 3], +]) +def test_invalid_rerank_req(documents): + global server + server.start() + res = server.make_request("POST", "/rerank", data={ + "query": "Machine learning is", + "documents": documents, + }) + assert res.status_code == 400 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py new file mode 100644 index 000000000..982d6abb4 --- /dev/null +++ b/examples/server/tests/unit/test_speculative.py @@ -0,0 +1,103 @@ +import pytest +from utils import * + +# We use a F16 MOE gguf as main model, and q4_0 as draft model + +server = ServerPreset.stories15m_moe() + +MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf" + +def create_server(): + global server + server = ServerPreset.stories15m_moe() + # download draft model file if needed + file_name = MODEL_DRAFT_FILE_URL.split('/').pop() + model_draft_file = f'../../../{file_name}' + if not os.path.exists(model_draft_file): + print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}") + with open(model_draft_file, 'wb') as f: + f.write(requests.get(MODEL_DRAFT_FILE_URL).content) + print(f"Done downloading draft model file") + # set default values + server.model_draft = model_draft_file + server.draft_min = 4 + server.draft_max = 8 + + +@pytest.fixture(scope="module", autouse=True) +def fixture_create_server(): + return create_server() + + +def test_with_and_without_draft(): + global server + server.model_draft = None # disable draft model + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + content_no_draft = res.body["content"] + server.stop() + + # create new server with draft model + create_server() + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + content_draft = res.body["content"] + + assert content_no_draft == content_draft + + +def test_different_draft_min_draft_max(): + global server + test_values = [ + (1, 2), + (1, 4), + (4, 8), + (4, 12), + (8, 16), + ] + last_content = None + for draft_min, draft_max in test_values: + server.stop() + server.draft_min = draft_min + server.draft_max = draft_max + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + if last_content is not None: + assert last_content == res.body["content"] + last_content = res.body["content"] + + +@pytest.mark.parametrize("n_slots,n_requests", [ + (1, 2), + (2, 2), +]) +def test_multi_requests_parallel(n_slots: int, n_requests: int): + global server + server.n_slots = n_slots + server.start() + tasks = [] + for _ in range(n_requests): + tasks.append((server.make_request, ("POST", "/completion", { + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + 
"top_k": 1, + }))) + results = parallel_function_calls(tasks) + for res in results: + assert res.status_code == 200 + assert match_regex("(wise|kind|owl|answer)+", res.body["content"]) diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index a831f113f..e17a05ff6 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -46,6 +46,7 @@ class ServerProcess: model_alias: str | None = None model_url: str | None = None model_file: str | None = None + model_draft: str | None = None n_threads: int | None = None n_gpu_layer: int | None = None n_batch: int | None = None @@ -68,6 +69,8 @@ class ServerProcess: response_format: str | None = None lora_files: List[str] | None = None disable_ctx_shift: int | None = False + draft_min: int | None = None + draft_max: int | None = None # session variables process: subprocess.Popen | None = None @@ -102,6 +105,8 @@ class ServerProcess: server_args.extend(["--model", self.model_file]) if self.model_url: server_args.extend(["--model-url", self.model_url]) + if self.model_draft: + server_args.extend(["--model-draft", self.model_draft]) if self.model_hf_repo: server_args.extend(["--hf-repo", self.model_hf_repo]) if self.model_hf_file: @@ -147,6 +152,10 @@ class ServerProcess: server_args.extend(["--no-context-shift"]) if self.api_key: server_args.extend(["--api-key", self.api_key]) + if self.draft_max: + server_args.extend(["--draft-max", self.draft_max]) + if self.draft_min: + server_args.extend(["--draft-min", self.draft_min]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") @@ -185,7 +194,8 @@ class ServerProcess: raise TimeoutError(f"Server did not start within {timeout_seconds} seconds") def stop(self) -> None: - server_instances.remove(self) + if self in server_instances: + server_instances.remove(self) if self.process: print(f"Stopping server with pid={self.process.pid}") self.process.kill() From 7cc2d2c88908fc92b97b28acafb82f7d6e425b85 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 29 Nov 2024 21:54:58 +0100 Subject: [PATCH 08/24] ggml : move AMX to the CPU backend (#10570) * ggml : move AMX to the CPU backend --------- Co-authored-by: Georgi Gerganov --- .clang-tidy | 2 + .github/workflows/build.yml | 5 + Makefile | 9 +- Package.swift | 4 +- common/CMakeLists.txt | 2 +- common/common.cpp | 10 + examples/batched-bench/CMakeLists.txt | 2 +- examples/batched/CMakeLists.txt | 2 +- .../convert-llama2c-to-ggml/CMakeLists.txt | 2 +- examples/cvector-generator/CMakeLists.txt | 2 +- examples/embedding/CMakeLists.txt | 2 +- examples/eval-callback/CMakeLists.txt | 2 +- examples/export-lora/CMakeLists.txt | 2 +- examples/gbnf-validator/CMakeLists.txt | 2 +- examples/gen-docs/CMakeLists.txt | 2 +- examples/gguf-hash/CMakeLists.txt | 2 +- examples/gguf-split/CMakeLists.txt | 2 +- examples/gguf/CMakeLists.txt | 2 +- examples/gritlm/CMakeLists.txt | 2 +- examples/imatrix/CMakeLists.txt | 2 +- examples/infill/CMakeLists.txt | 2 +- examples/llama-bench/CMakeLists.txt | 2 +- examples/llava/CMakeLists.txt | 6 +- examples/lookahead/CMakeLists.txt | 2 +- examples/lookup/CMakeLists.txt | 8 +- examples/main-cmake-pkg/CMakeLists.txt | 2 +- examples/main/CMakeLists.txt | 2 +- examples/parallel/CMakeLists.txt | 2 +- examples/passkey/CMakeLists.txt | 2 +- examples/perplexity/CMakeLists.txt | 2 +- examples/quantize-stats/CMakeLists.txt | 2 +- examples/quantize/CMakeLists.txt | 2 +- examples/retrieval/CMakeLists.txt | 2 +- examples/run/CMakeLists.txt | 2 +- 
examples/save-load-state/CMakeLists.txt | 2 +- examples/server/CMakeLists.txt | 2 +- examples/simple-chat/CMakeLists.txt | 2 +- examples/simple/CMakeLists.txt | 2 +- examples/speculative-simple/CMakeLists.txt | 2 +- examples/speculative/CMakeLists.txt | 2 +- examples/tokenize/CMakeLists.txt | 2 +- ggml/CMakeLists.txt | 1 - ggml/include/ggml-amx.h | 25 - ggml/src/CMakeLists.txt | 16 +- ggml/src/ggml-amx/CMakeLists.txt | 105 ---- ggml/src/ggml-amx/ggml-amx.cpp | 449 ------------------ ggml/src/ggml-backend-reg.cpp | 7 - ggml/src/ggml-backend.cpp | 3 +- ggml/src/ggml-cpu/CMakeLists.txt | 92 ++-- ggml/src/ggml-cpu/amx/amx.cpp | 196 ++++++++ ggml/src/ggml-cpu/amx/amx.h | 20 + ggml/src/{ggml-amx => ggml-cpu/amx}/common.h | 23 +- ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.cpp | 96 ++-- ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.h | 3 +- ggml/src/ggml-cpu/ggml-cpu-impl.h | 15 + ggml/src/ggml-cpu/ggml-cpu.c | 84 ++-- ggml/src/ggml-cpu/ggml-cpu.cpp | 34 +- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 3 +- ggml/src/ggml-impl.h | 10 +- .../ggml-vulkan/vulkan-shaders/CMakeLists.txt | 2 +- pocs/vdot/CMakeLists.txt | 4 +- src/CMakeLists.txt | 2 +- src/unicode.cpp | 11 + tests/test-sampling.cpp | 2 +- 64 files changed, 514 insertions(+), 801 deletions(-) delete mode 100644 ggml/include/ggml-amx.h delete mode 100644 ggml/src/ggml-amx/CMakeLists.txt delete mode 100644 ggml/src/ggml-amx/ggml-amx.cpp create mode 100644 ggml/src/ggml-cpu/amx/amx.cpp create mode 100644 ggml/src/ggml-cpu/amx/amx.h rename ggml/src/{ggml-amx => ggml-cpu/amx}/common.h (77%) rename ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.cpp (98%) rename ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.h (72%) diff --git a/.clang-tidy b/.clang-tidy index 952c0cca8..310c3d182 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,8 +17,10 @@ Checks: > -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, portability-*, + -portability-simd-intrinsics, misc-*, -misc-const-correctness, -misc-non-private-member-variables-in-classes, -misc-no-recursion, + -misc-use-anonymous-namespace, FormatStyle: none diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 48953dafa..e2291bd34 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1121,6 +1121,11 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + - name: Build id: cmake_build run: | diff --git a/Makefile b/Makefile index 25214ec05..83adcef28 100644 --- a/Makefile +++ b/Makefile @@ -254,8 +254,8 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC -MK_NVCCFLAGS = -std=c++11 +MK_CXXFLAGS = -std=c++17 -fPIC +MK_NVCCFLAGS = -std=c++17 ifdef LLAMA_NO_CCACHE GGML_NO_CCACHE := 1 @@ -575,9 +575,12 @@ endif ifndef GGML_NO_AMX MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o + OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o endif +# only necessary for the CPU backend files +MK_CPPFLAGS += -Iggml/src/ggml-cpu + ifdef GGML_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJ_GGML_EXT += ggml/src/ggml-rpc.o diff --git a/Package.swift b/Package.swift index d9e8a4e2d..1e75aa7e2 100644 --- a/Package.swift +++ b/Package.swift @@ -28,13 +28,16 @@ var cSettings: [CSetting] = [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-fno-objc-arc"]), 
.headerSearchPath("ggml/src"), + .headerSearchPath("ggml/src/ggml-cpu"), // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_LAPACK_ILP64") + .define("GGML_USE_CPU"), ] + #if canImport(Darwin) sources.append("ggml/src/ggml-common.h") sources.append("ggml/src/ggml-metal/ggml-metal.m") @@ -44,7 +47,6 @@ cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), .define("GGML_USE_METAL"), - .define("GGML_USE_CPU") ] ) #endif diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 223174884..89862fe11 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -88,5 +88,5 @@ if (LLAMA_CURL) endif () target_include_directories(${TARGET} PUBLIC .) -target_compile_features (${TARGET} PUBLIC cxx_std_11) +target_compile_features (${TARGET} PUBLIC cxx_std_17) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/common.cpp b/common/common.cpp index 2b2f00098..6143516d2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) { std::u32string filename_utf32; try { +#if defined(__clang__) + // disable C++17 deprecation warning for std::codecvt_utf8 +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif std::wstring_convert, char32_t> converter; + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + filename_utf32 = converter.from_bytes(filename); // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 959acaeee..68ad707f3 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 77e33343b..0d439f498 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index a6790e617..44e5f722a 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 0a559d60c..49ad9561c 100644 --- 
a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8256e789a..809040307 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index 5d1048aad..95915ed91 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -2,7 +2,7 @@ set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TEST_TARGET test-eval-callback) add_test(NAME ${TEST_TARGET} diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 1cef6e716..310455787 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 4edd6ec73..d2cb524c0 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt index c94cda776..25de0af35 100644 --- a/examples/gen-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gen-docs) add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 7a494ce32..15c5c68c6 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) 
+target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index f63887da7..c407e2f0a 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index a9569b411..fb04eb83f 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index 86dfddca3..fa1b4dc70 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d4c8265bd..412696c47 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 9b1aa3b63..fb26628d8 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 5bdbea4e2..17e3b9b87 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index bbf5fec58..5d32f377f 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .) target_include_directories(llava PUBLIC ../..) 
target_include_directories(llava PUBLIC ../../common) -target_compile_features(llava PRIVATE cxx_std_11) +target_compile_features(llava PRIVATE cxx_std_17) add_library(llava_static STATIC $) if (BUILD_SHARED_LIBS) @@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-minicpmv-cli) add_executable(${TARGET} minicpmv-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt index f0ae5cd89..346861314 100644 --- a/examples/lookahead/CMakeLists.txt +++ b/examples/lookahead/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-lookahead) add_executable(${TARGET} lookahead.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index ef19fe25e..fba78ceda 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -2,22 +2,22 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 3b38db292..5563f4de0 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) target_include_directories(${TARGET} PRIVATE ${_common_path}) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index 5f6efaa9a..af3d9150f 100644 --- 
a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index c13557bac..847e916de 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-parallel) add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt index dc467a5d3..9bc5110c2 100644 --- a/examples/passkey/CMakeLists.txt +++ b/examples/passkey/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-passkey) add_executable(${TARGET} passkey.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index be0f2fd02..3e6864093 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index bb986a716..9a3a0d3cd 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 62680cda4..47e5cbe30 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt index 66610f311..512a602ec 100644 --- a/examples/retrieval/CMakeLists.txt +++ b/examples/retrieval/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-retrieval) add_executable(${TARGET} retrieval.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 
084f1e92d..52add51ef 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 0fb5e359b..0f50e50de 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-save-load-state) add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 93e876f5a..e82f91533 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -50,4 +50,4 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt index 87723533b..567f7fbbb 100644 --- a/examples/simple-chat/CMakeLists.txt +++ b/examples/simple-chat/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-simple-chat) add_executable(${TARGET} simple-chat.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index b63afbb8b..104ecabfd 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt index 7a3a141c2..aeaea74fc 100644 --- a/examples/speculative-simple/CMakeLists.txt +++ b/examples/speculative-simple/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple) add_executable(${TARGET} speculative-simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt index aa208e7aa..c84196bd9 100644 --- a/examples/speculative/CMakeLists.txt +++ b/examples/speculative/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-speculative) add_executable(${TARGET} speculative.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt index b704dcae1..1690b53e5 100644 --- a/examples/tokenize/CMakeLists.txt +++ 
b/examples/tokenize/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 70b5cfdf7..789fa3b0c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -161,7 +161,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_RPC "ggml: use RPC" OFF) -option(GGML_AMX "ggml: use AMX" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h deleted file mode 100644 index 042d6d919..000000000 --- a/ggml/include/ggml-amx.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -// buffer_type API -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); - -GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend); - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void); - -GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9022aa3ae..19289f32b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -261,21 +261,15 @@ function(ggml_add_backend backend) if (${backend_id}) string(TOLOWER "ggml-${backend}" backend_target) add_subdirectory(${backend_target}) - # check again in case the backend disabled itself - # note that this should NOT be the normal behavior, in case of errors the backend should fail the build - # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp - if (${backend_id}) - message(STATUS "Including ${backend} backend") - if (NOT GGML_BACKEND_DL) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) - endif() + message(STATUS "Including ${backend} backend") + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() ggml_add_backend(CPU) -ggml_add_backend(AMX) ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) @@ -289,7 +283,7 @@ ggml_add_backend(Vulkan) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) - target_compile_features (${target} PRIVATE c_std_11) # don't bump + target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump endforeach() target_link_libraries(ggml-base PRIVATE Threads::Threads) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt deleted file mode 100644 index cf3ade6f0..000000000 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ /dev/null @@ -1,105 +0,0 @@ -if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES 
"^(x86_64|i686|AMD64)$") AND - CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) - message(STATUS "Using AMX") - - file(GLOB GGML_HEADERS_AMX "*.h") - list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h") - - file(GLOB GGML_SOURCES_AMX "*.cpp") - - ggml_add_backend_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX} - ) - - # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags - # TODO: integrate AMX backend into the CPU backend - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - # TODO: improve, should not reference files from the parent folder - include(../ggml-cpu/cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (GGML_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) - endif() - if (GGML_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) - endif() - if (GGML_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) - endif() - if (GGML_AMX_TILE) - add_compile_definitions($<$:__AMX_TILE__>) - add_compile_definitions($<$:__AMX_TILE__>) - endif() - if (GGML_AMX_INT8) - add_compile_definitions($<$:__AMX_INT8__>) - add_compile_definitions($<$:__AMX_INT8__>) - endif() - if (GGML_AMX_BF16) - add_compile_definitions($<$:__AMX_BF16__>) - add_compile_definitions($<$:__AMX_BF16__>) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) - endif() - endif() - - target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS}) -else() - set(GGML_AMX OFF PARENT_SCOPE) - message(WARNING "AMX requires x86 and gcc version > 11.0. 
Turning off GGML_AMX.") -endif() diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp deleted file mode 100644 index 6bfb3da27..000000000 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ /dev/null @@ -1,449 +0,0 @@ -#include "ggml-amx.h" -#include "ggml-amx/common.h" -#include "ggml-amx/mmq.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" - -#if defined(__gnu_linux__) -#include -#include -#endif - -#include -#include -#include - -#if defined(__AMX_INT8__) - -// AMX buffer interface -static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); -} - -static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)(buffer->context); -} - -static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - if (qtype_has_amx_kernels(tensor->type)) { - ggml_backend_amx_convert_weight(tensor, data, offset, size); - } else { - memcpy((char *)tensor->data + offset, data, size); - } - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - if (qtype_has_amx_kernels(src->type)) { - ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst)); - } else { - memcpy(dst->data, src->data, ggml_nbytes(src)); - } - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - memset(buffer->context, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { - /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, - /* .get_base = */ ggml_backend_amx_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, - /* .clear = */ ggml_backend_amx_buffer_clear, - /* .reset = */ NULL, -}; - -static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "AMX"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * data = aligned_alloc(TENSOR_ALIGNMENT, size); - if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); - return NULL; - } - - return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); -} - -static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -static size_t 
ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { - return ggml_backend_amx_get_alloc_size(tensor); - - GGML_UNUSED(buft); -} - -static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - GGML_UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { - /* .iface = */ { - /* .get_name = */ ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_buffer_type_amx; -} - -// backend interface - -static const char * ggml_backend_amx_name(ggml_backend_t backend) { - return "AMX"; - - GGML_UNUSED(backend); -} - -static void ggml_backend_amx_free(ggml_backend_t backend) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - delete ctx; - delete backend; -} - -static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_backend_amx_mul_mat(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - - default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i ggml_backend_amx_i = { - /* .get_name = */ ggml_backend_amx_name, - /* .free = */ ggml_backend_amx_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_amx_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_amx_guid() { - static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e }; - return &guid; -} - -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 - -static bool ggml_amx_init() { -#if defined(__gnu_linux__) - if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { - fprintf(stderr, "AMX is not ready to be used!\n"); - return false; - } - return true; -#elif defined(_WIN32) - return true; -#endif -} - -ggml_backend_t ggml_backend_amx_init() { - - // invoke a Linux system call to request access to AMX features - ggml_amx_init(); - - // backend context - ggml_backend_amx_context * ctx = new ggml_backend_amx_context; - - // ggml amx backend - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_amx_guid(), - /* .interface = */ ggml_backend_amx_i, - /* .device = */ 
ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ ctx, - }; - - return backend; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid()); -} - -void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - GGML_ASSERT(ggml_backend_is_amx(backend_amx)); - - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context; - ctx->n_threads = n_threads; -} - -// device interface - -static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) { - return "AMX"; - - GGML_UNUSED(dev); -} - -static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) { - return "Intel Advanced Matrix Extensions"; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; - - GGML_UNUSED(dev); -} - -static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_amx_device_get_name(dev); - props->description = ggml_backend_amx_device_get_description(dev); - props->type = ggml_backend_amx_device_get_type(dev); - ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total); - - // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) { - return ggml_backend_amx_init(); - - GGML_UNUSED(dev); - GGML_UNUSED(params); -} - -static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_amx_buffer_type(); - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const enum ggml_type type = src0->type; - const int64_t ne0 = op->ne[0]; - - // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 - // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 - bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); - - bool can_use_amx = - is_contiguous_2d(src0) && // src0 must be contiguous - is_contiguous_2d(src1) && // src1 must be contiguous - src1->type == GGML_TYPE_F32 && // src1 must be float32 - has_amx_kernels && // with amx kernel impls - ne0 % (TILE_N * 2) == 0; // out_features is 32x - - return can_use_amx; - } - default: - return false; - } - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i 
ggml_backend_amx_device_i = { - /* .get_name = */ ggml_backend_amx_device_get_name, - /* .get_description = */ ggml_backend_amx_device_get_description, - /* .get_memory = */ ggml_backend_amx_device_get_memory, - /* .get_type = */ ggml_backend_amx_device_get_type, - /* .get_props = */ ggml_backend_amx_device_get_props, - /* .init_backend = */ ggml_backend_amx_device_init, - /* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_amx_device_supports_op, - /* .supports_buft = */ ggml_backend_amx_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -// backend reg interface - -static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) { - return "AMX"; - - GGML_UNUSED(reg); -} - -static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); -} - -static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - static ggml_backend_device ggml_backend_amx_device = { - /* .iface = */ ggml_backend_amx_device_i, - /* .reg = */ reg, - /* .context = */ nullptr, - }; - - return &ggml_backend_amx_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); -} - -static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_amx_set_n_threads; - } - return NULL; - - GGML_UNUSED(reg); - GGML_UNUSED(name); -} - -static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { - /* .get_name = */ ggml_backend_amx_reg_get_name, - /* .get_device_count = */ ggml_backend_amx_reg_get_device_count, - /* .get_device = */ ggml_backend_amx_reg_get_device, - /* .get_proc_address = */ ggml_backend_amx_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, - }; - - return &ggml_backend_amx_reg; -} - -#else // if defined(__AMX_INT8__) - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) { - return nullptr; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - GGML_UNUSED(backend); - return false; -} - -ggml_backend_t ggml_backend_amx_init(void) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - return nullptr; -} - -void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(backend_amx); - GGML_UNUSED(n_threads); -} - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - return nullptr; -} - -#endif - -GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index a0e0e2c58..3182b84f5 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,10 +49,6 @@ #include "ggml-rpc.h" #endif -#ifdef GGML_USE_AMX -# include "ggml-amx.h" -#endif - #ifdef GGML_USE_CANN #include "ggml-cann.h" #endif @@ -92,9 +88,6 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif -#ifdef GGML_USE_AMX - register_backend(ggml_backend_amx_reg()); -#endif #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif diff --git 
a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 45da0c27d..fdb4b986f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { // since the tensor is pre-allocated, it cannot be moved to another backend - GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name); + ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op)); } // graph input diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 4dbc1f75b..fe2222084 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,12 +1,20 @@ -ggml_add_backend_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu) +list (APPEND GGML_CPU_SOURCES + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + amx/amx.cpp + amx/amx.h + amx/mmq.cpp + amx/mmq.h + ggml-cpu-impl.h + ) + +target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) target_include_directories(ggml-cpu PRIVATE .) if (APPLE AND GGML_ACCELERATE) @@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE) if (ACCELERATE_FRAMEWORK) message(STATUS "Accelerate framework found") - add_compile_definitions(GGML_USE_ACCELERATE) - add_compile_definitions(ACCELERATE_NEW_LAPACK) - add_compile_definitions(ACCELERATE_LAPACK_ILP64) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) else() @@ -29,15 +37,9 @@ if (GGML_OPENMP) if (OpenMP_FOUND) message(STATUS "OpenMP found") - add_compile_definitions(GGML_USE_OPENMP) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - - # FIXME: should be replaced with a compiler id check - #if (GGML_MUSA) - # list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include") - # list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so") - #endif() else() message(WARNING "OpenMP not found") endif() @@ -46,11 +48,11 @@ endif() if (GGML_LLAMAFILE) message(STATUS "Using llamafile") - add_compile_definitions(GGML_USE_LLAMAFILE) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) + list(APPEND GGML_CPU_SOURCES + llamafile/sgemm.cpp + llamafile/sgemm.h) endif() if (GGML_CPU_HBM) @@ -58,7 +60,7 @@ if (GGML_CPU_HBM) message(STATUS "Using memkind for CPU HBM") - add_compile_definitions(GGML_USE_CPU_HBM) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) target_link_libraries(ggml-cpu PUBLIC memkind) endif() @@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR message(STATUS "ARM detected") if (MSVC) - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead - add_compile_definitions(__ARM_NEON) - add_compile_definitions(__ARM_FEATURE_FMA) + list(APPEND 
ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD) - add_compile_definitions(__ARM_FEATURE_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) message(STATUS "ARM feature DOTPROD enabled") endif () @@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) message(STATUS "ARM feature MATMUL_INT8 enabled") endif () check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") endif () @@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD) set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - add_compile_definitions(__ARM_FEATURE_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) message(STATUS "ARM feature DOTPROD enabled") endif () @@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) message(STATUS "ARM feature MATMUL_INT8 enabled") endif () @@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) - # TODO: improve, should not reference files from the parent folder include(cmake/FindSIMD.cmake) endif () if (GGML_AVX512) @@ -185,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW # macros corresponding to the extensions. # Do it manually. 
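
MSVC has no per-extension /arch switches and does not predefine the corresponding feature macros, which is why the build appends them to ARCH_DEFINITIONS by hand; sources compiled with those definitions then gate their fast paths on the same macros. A minimal sketch of that consumption pattern, assuming only standard predefined macros plus the manually added ones (the function below is illustrative and not part of the patch):

    // Illustrative only: how a translation unit built with the ARCH_DEFINITIONS
    // above would typically pick a code path at compile time.
    #include <cstdio>

    static const char * cpu_feature_label(void) {
    #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
        return "amx-int8 + avx512-vnni";
    #elif defined(__AVX512F__)
        return "avx512f";
    #elif defined(__AVX2__)
        return "avx2";
    #else
        return "baseline";
    #endif
    }

    int main(void) {
        std::printf("compiled feature level: %s\n", cpu_feature_label());
        return 0;
    }
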
if (GGML_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512vbmi) endif() endif() if (GGML_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() if (GGML_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512bf16) endif() endif() if (GGML_AMX_TILE) - add_compile_definitions($<$:__AMX_TILE__>) - add_compile_definitions($<$:__AMX_TILE__>) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__) endif() if (GGML_AMX_INT8) - add_compile_definitions($<$:__AMX_INT8__>) - add_compile_definitions($<$:__AMX_INT8__>) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__) endif() if (GGML_AMX_BF16) - add_compile_definitions($<$:__AMX_BF16__>) - add_compile_definitions($<$:__AMX_BF16__>) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__) endif() elseif (GGML_AVX2) list(APPEND ARCH_FLAGS /arch:AVX2) @@ -276,7 +271,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") message(STATUS "loongarch64 detected") @@ -299,11 +294,12 @@ endif() if (GGML_CPU_AARCH64) message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") - add_compile_definitions(GGML_USE_CPU_AARCH64) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) endif() -target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") -target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") +target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) +set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") +set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") if (EMSCRIPTEN) set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp new file mode 100644 index 000000000..09c0df0f5 --- /dev/null +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -0,0 +1,196 @@ +#include "amx.h" +#include "common.h" +#include "mmq.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" + +#if defined(__gnu_linux__) +#include +#include +#endif + +#include +#include +#include + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + +// AMX buffer interface +static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); +} + +static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)(buffer->context); +} + +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + memset((char *)tensor->data + offset, value, size); + + 
GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (qtype_has_amx_kernels(tensor->type)) { + ggml_backend_amx_convert_weight(tensor, data, offset, size); + } else { + memcpy((char *)tensor->data + offset, data, size); + } + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + if (qtype_has_amx_kernels(src->type)) { + ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst)); + } else { + memcpy(dst->data, src->data, ggml_nbytes(src)); + } + return true; + } + return false; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { + /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, + /* .get_base = */ ggml_backend_amx_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, + /* .clear = */ ggml_backend_amx_buffer_clear, + /* .reset = */ NULL, +}; + +static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "AMX"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * data = aligned_alloc(TENSOR_ALIGNMENT, size); + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); +} + +static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + return ggml_backend_amx_get_alloc_size(tensor); + + GGML_UNUSED(buft); +} + +static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 + +static bool ggml_amx_init() { +#if defined(__gnu_linux__) + if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { + fprintf(stderr, "AMX is not ready to be used!\n"); + return false; + } + return true; +#elif defined(_WIN32) + return true; +#endif +} +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { + /* .iface = */ { + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ 
ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_amx_buffer_type_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + if (!ggml_amx_init()) { + return NULL; + } + + return &ggml_backend_buffer_type_amx; +} + +bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; +} + +bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) { + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + + case GGML_OP_MUL_MAT: { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + const enum ggml_type type = src0->type; + const int64_t ne0 = op->ne[0]; + + // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 + // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 + bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); + + bool can_use_amx = + is_contiguous_2d(src0) && // src0 must be contiguous + is_contiguous_2d(src1) && // src1 must be contiguous + src1->type == GGML_TYPE_F32 && // src1 must be float32 + has_amx_kernels && // with amx kernel impls + ne0 % (TILE_N * 2) == 0; // out_features is 32x + + return can_use_amx; + } + default: + return false; + } +} + +#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h new file mode 100644 index 000000000..c43546273 --- /dev/null +++ b/ggml/src/ggml-cpu/amx/amx.h @@ -0,0 +1,20 @@ +#include "ggml-backend.h" +#include "ggml-cpu-impl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); +bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft); +bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op); +void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-cpu/amx/common.h similarity index 77% rename from ggml/src/ggml-amx/common.h rename to ggml/src/ggml-cpu/amx/common.h index 5db8ce30d..0b0657289 100644 --- a/ggml/src/ggml-amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -1,8 +1,7 @@ #pragma once #include "ggml.h" -// hack until AMX is moved into the CPU backend -#include "../ggml-cpu/ggml-cpu-impl.h" // +#include "ggml-cpu-impl.h" #include #include @@ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) { #endif } +template +inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) { + int tbegin, tend; + balance211(n, params->nth, params->ith, tbegin, tend); + f(tbegin, tend); + ggml_barrier(params->threadpool); // TODO: might not always be needed +} + // quantized types that have AMX support inline bool qtype_has_amx_kernels(const enum ggml_type type) { // TODO: fix padding for vnni format return (type == GGML_TYPE_Q4_0) || - (type == 
GGML_TYPE_Q4_1); - //(type == GGML_TYPE_Q8_0) || - //(type == GGML_TYPE_Q4_K) || - //(type == GGML_TYPE_Q5_K) || - //(type == GGML_TYPE_Q6_K) || - //(type == GGML_TYPE_IQ4_XS); + (type == GGML_TYPE_Q4_1) || + (type == GGML_TYPE_Q8_0) || + (type == GGML_TYPE_Q4_K) || + (type == GGML_TYPE_Q5_K) || + (type == GGML_TYPE_Q6_K) || + (type == GGML_TYPE_IQ4_XS); } // ggml backend context diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp similarity index 98% rename from ggml/src/ggml-amx/mmq.cpp rename to ggml/src/ggml-cpu/amx/mmq.cpp index 529bee25b..6447e73d0 100644 --- a/ggml/src/ggml-amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -4,8 +4,11 @@ #pragma GCC diagnostic ignored "-Wunused-local-typedefs" #endif +#include "amx.h" #include "mmq.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml-cpu-quants.h" #include "ggml-quants.h" #include #include @@ -33,7 +36,7 @@ #define ALWAYS_INLINE inline #endif -#if defined(__AMX_INT8__) +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) namespace { @@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k); template <> inline void from_float(const float * x, char * vy, int64_t k) { - // FIXME: using unoptimized reference impl until moved to CPU backend - quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k); + quantize_row_q8_0(x, (block_q8_0 *)vy, k); } template <> inline void from_float(const float * x, char * vy, int64_t k) { - quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k); + quantize_row_q8_1(x, (block_q8_1 *)vy, k); } template <> @@ -950,7 +952,7 @@ template> void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { GGML_UNUSED(tile); GGML_UNUSED(packed_B); -}; +} template <> void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { @@ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) { // pack weight to vnni format void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - - size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor); - GGML_ASSERT(alloc_size == size); + GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now const enum ggml_type TYPE = tensor->type; @@ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d }); } +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src[0]; + + const enum ggml_type TYPE = src0->type; + + const bool is_floating_type = TYPE == GGML_TYPE_F16; + if (is_floating_type) { + return 0; + } + + const int M = dst->ne[1]; + const int K = src0->ne[0]; + + size_t desired_wsize = 0; + + GGML_DISPATCH_QTYPES(TYPE, [&] { + const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); + desired_wsize = M * row_size_A; + }); + + return desired_wsize; +} + // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX) // // src0: weight in shape of {N, K}, quantized @@ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d // // the function performs: dst = src1 @ src0.T // -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { +void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) { struct ggml_tensor * src0 = dst->src[0]; struct ggml_tensor * src1 = dst->src[1]; const enum ggml_type TYPE = src0->type; - const int n_threads = ctx->n_threads; - // f16 
only has avx512 kernels for now, // amx kernels will be added once 6th gen xeon is released. const bool is_floating_type = TYPE == GGML_TYPE_F16; @@ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, MB * NB, [&](int begin, int end) { GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { for (int i = begin; i < end; ++i) { int mb = i / NB; @@ -2412,27 +2433,29 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor } // pointer to work space, used convert A from float to quantized type - void * wdata = nullptr; + void * wdata = params->wdata; //TODO: performance improvement: merge quant A - GGML_DISPATCH_QTYPES(TYPE, [&] { - const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); - const size_t desired_wsize = M * row_size_A; - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - wdata = ctx->work_data.get(); + if (params->ith == 0) { + GGML_DISPATCH_QTYPES(TYPE, [&] { + const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); + const size_t desired_wsize = M * row_size_A; + if (params->wsize < desired_wsize) { + GGML_ABORT("insufficient work space size"); + } - // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size - // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size - GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); + // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size + // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size + GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - const float * A_data = static_cast(src1->data); - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); - } - }); + const float * A_data = static_cast(src1->data); + for (int m = 0; m < M; ++m) { + from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); + } + }); + } + + ggml_barrier(params->threadpool); if (M == 1) { // MB = 1 and handle 8 tiles in each block @@ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor constexpr int BLOCK_N = TILE_N * kTilesN; const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, NB, [&](int begin, int end) { + parallel_for_ggml(params, NB, [&](int begin, int end) { GGML_DISPATCH_QTYPES(TYPE, [&] { const int KB = K / blck_size; const int TILE_SIZE = get_tile_size(); @@ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, MB * NB, [&](int begin, int end) { // init tile config for each thread ggml_tile_config_init(); @@ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor }); } -#else // if defined(__AMX_INT8__) - -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -#endif // if defined(__AMX_INT8__) +#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h similarity index 72% rename from ggml/src/ggml-amx/mmq.h rename to ggml/src/ggml-cpu/amx/mmq.h index cf0920620..f37366093 100644 
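
In the reworked mul_mat path above, thread 0 converts the float32 activations into the quantized vec-dot type inside the shared work buffer and every thread then waits on ggml_barrier, so ggml_backend_amx_desired_wsize only has to reserve one converted row per row of the result (for the quantized types; the f16 path returns 0). The sizing is simple arithmetic; below is a standalone sketch with an assumed Q8_0-style block (32 values plus a 16-bit scale) standing in for the real vec_dot_type:

    // Standalone sketch of the work-buffer sizing; the block layout is an
    // assumption (Q8_0-style: 32 quantized values + one fp16 scale).
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct block_q8_0_like {
        uint16_t d;       // scale (raw fp16 bits)
        int8_t   qs[32];  // 32 quantized values
    };

    static size_t desired_wsize(int64_t M, int64_t K) {
        const int64_t blck_size  = 32;  // values per quant block (assumed)
        const size_t  row_size_A = (size_t)(K / blck_size) * sizeof(block_q8_0_like);
        return (size_t) M * row_size_A; // one converted row of src1 per row of the result
    }

    int main(void) {
        // e.g. M = 4 prompt tokens, K = 4096 hidden dimension
        std::printf("work buffer: %zu bytes\n", desired_wsize(4, 4096));
        return 0;
    }
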
--- a/ggml/src/ggml-amx/mmq.h +++ b/ggml/src/ggml-cpu/amx/mmq.h @@ -1,6 +1,5 @@ #pragma once #include "common.h" -#include #ifdef __cplusplus extern "C" { @@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); +void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 27a530b22..d71076ad1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -15,6 +15,18 @@ extern "C" { #endif +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + + struct ggml_threadpool * threadpool; +}; + + #if defined(_MSC_VER) #define m512bh(p) p @@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) { } #endif +// TODO: move to ggml-threading +void ggml_barrier(struct ggml_threadpool * tp); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e0cefc20b..23ae2e10c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10,6 +10,7 @@ #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" +#include "amx/amx.h" #include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) @@ -624,7 +625,7 @@ do { \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \ } \ - res = _mm512_reduce_add_ps(x[0]); \ + res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ } while (0) // TODO: is this optimal ? 
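
With ggml_compute_params and the ggml_barrier declaration now exposed in ggml-cpu-impl.h, other CPU-backend translation units (such as the AMX code) can follow the backend's usual threading contract: slice the work by ith/nth, then synchronize before reading data produced by other threads. The sketch below models that contract with a hypothetical row-wise kernel and a stand-in for the params struct; the barrier itself is only indicated by a comment:

    // Hypothetical row-wise kernel that mirrors the ith/nth split used by the
    // CPU backend; a standalone model, not the backend's own code.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct compute_params_like {
        int    ith, nth;   // thread index / number of threads
        size_t wsize;      // shared work buffer size
        void * wdata;      // shared work buffer
    };

    static void scale_rows(const compute_params_like & p,
                           float * data, int64_t nrows, int64_t ncols, float s) {
        const int64_t per_thread = (nrows + p.nth - 1) / p.nth;
        const int64_t ir0 = per_thread * p.ith;
        const int64_t ir1 = std::min(ir0 + per_thread, nrows);
        for (int64_t r = ir0; r < ir1; ++r) {
            for (int64_t c = 0; c < ncols; ++c) {
                data[r*ncols + c] *= s;
            }
        }
        // the real backend would call ggml_barrier() on its threadpool here,
        // before any thread reads rows written by its peers
    }

    int main(void) {
        const int64_t nrows = 8, ncols = 4;
        std::vector<float> m(nrows*ncols, 1.0f);
        for (int ith = 0; ith < 2; ++ith) {           // simulate a 2-thread pool
            compute_params_like p = { ith, 2, 0, nullptr };
            scale_rows(p, m.data(), nrows, ncols, 3.0f);
        }
        std::printf("m[0] = %.1f\n", m[0]);           // prints 3.0
        return 0;
    }
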
@@ -674,7 +675,7 @@ do { \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \ } \ - res = _mm512_reduce_add_ps(x[0]); \ + res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ } while (0) #define GGML_F16_VEC GGML_F32Cx16 @@ -685,8 +686,8 @@ do { \ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE #elif defined(__AVX__) #define GGML_SIMD @@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL __lsx_vfmul_s -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ - tmp = __lsx_vsrli_d((__m128i)t0, 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \ + tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ + tmp = __lsx_vsrli_d((__m128i) t0, 32); \ + tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1367,31 +1368,15 @@ struct ggml_compute_state { int ith; }; -struct ggml_compute_params { - // ith = thread index, nth = number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; - - struct ggml_threadpool * threadpool; -}; - // // fundamental operations // inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add1_f32(const int n, float * z, const 
float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } @@ -2286,7 +2271,7 @@ struct ggml_state { static struct ggml_state g_state = {0}; -static void ggml_barrier(struct ggml_threadpool * tp) { +void ggml_barrier(struct ggml_threadpool * tp) { int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); if (n_threads == 1) { return; @@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat( type = (enum ggml_type)(intptr_t)src0->extra; } +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { + ggml_backend_amx_mul_mat(params, dst); + return; + } +#endif + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; @@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan( } break; case GGML_OP_MUL_MAT: { +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) { + cur = ggml_backend_amx_desired_wsize(node); + } +#endif const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; if (node->src[1]->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + cur = MAX(cur, cur2); } } break; case GGML_OP_MUL_MAT_ID: diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 44d99089a..77e5d87a8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -3,6 +3,7 @@ #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" #include "ggml-impl.h" +#include "amx/amx.h" #include #include #include @@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen static std::vector bufts = []() { std::vector bufts; -#ifdef GGML_USE_CPU_HBM - bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (ggml_backend_amx_buffer_type()) { + bufts.push_back(ggml_backend_amx_buffer_type()); + } #endif #ifdef GGML_USE_CPU_AARCH64 - bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); + if (ggml_backend_cpu_aarch64_buffer_type()) { + bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); + } #endif bufts.push_back(NULL); @@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; + if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) { + return true; + } + if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { return false; } } +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { + return ggml_backend_amx_device_supports_op(op); + } + for (int i = 1; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) { + return false; + } + } +#endif + for 
(int i = 1; i < GGML_MAX_SRC; i++) { if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { return false; @@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); + bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + supported = supported || ggml_backend_amx_buft_is_amx(buft); +#endif + + return supported; GGML_UNUSED(dev); } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index b2ce2e664..da4146ec4 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -50,8 +50,7 @@ #include "sgemm.h" #include "ggml-impl.h" -// hack until moved into the CPU backend -#include "../ggml-cpu-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index f39b7a88c..78e3af8f2 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -30,11 +30,13 @@ extern "C" { #endif -#undef MIN -#undef MAX +#ifndef MIN +# define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#ifndef MAX +# define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif // required for mmap as gguf only guarantees 32-byte alignment #define TENSOR_ALIGNMENT 32 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 10075db33..51c78b7d2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -3,5 +3,5 @@ find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index d5405ad29..6235aec1f 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -1,9 +1,9 @@ set(TARGET llama-vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-q8dot) add_executable(${TARGET} q8dot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2f581b921..f3b3908b1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,7 +25,7 @@ add_library(llama ) target_include_directories(llama PUBLIC . 
../include) -target_compile_features (llama PUBLIC cxx_std_11) # don't bump +target_compile_features (llama PUBLIC cxx_std_17) # don't bump target_link_libraries(llama PUBLIC ggml) diff --git a/src/unicode.cpp b/src/unicode.cpp index 50b35bbbc..3d4592635 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -201,7 +201,18 @@ static std::unordered_map unicode_utf8_to_byte_map() { } static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { +#if defined(__clang__) + // disable C++17 deprecation warning for std::codecvt_utf8 +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + std::wstring_convert> conv; + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + return conv.from_bytes(s); } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index be370044d..e5c9e75e4 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -284,7 +284,7 @@ static void test_perf() { data.reserve(n_vocab); for (int i = 0; i < n_vocab; i++) { - const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f); + const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5); data.emplace_back(llama_token_data{i, logit, 0.0f}); } From 0533e7fb3842a523f64dc533bd7bd7147ec2c63a Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sat, 30 Nov 2024 07:00:02 +0000 Subject: [PATCH 09/24] vulkan: Dynamic subgroup size support for Q6_K mat_vec (#10536) * subgroup 64 version with subgroup add. 15% faster scalable version tested for subgroup sizes 16-128 * check for subgroup multiple of 16 and greater than 16 * subgroup sizes are always a power of 2 (https://github.com/KhronosGroup/GLSL/issues/45) * force 16 sequential threads per block * make 16 subgroup size a constant --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 ++++---- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 30 +++++++++---------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 849c11923..df6a659f4 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1231,6 +1231,9 @@ static void ggml_vk_load_shaders(vk_device& device) { std::cerr << "ggml_vulkan: Compiling shaders"; + // some shaders require the subgroup size to be 16 or larger + const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); + // mulmat std::vector l_warptile, m_warptile, s_warptile, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq; @@ -1240,11 +1243,11 @@ static void ggml_vk_load_shaders(vk_device& device) { l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; @@ -1431,7 +1434,7 @@ static void 
ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -1445,7 +1448,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", 
mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -1459,7 +1462,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); // dequant shaders diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 0b392d68d..760aff854 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -4,9 +4,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -21,21 +23,19 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; - const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
- const uint v_in = tid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_in = itid - step*v_im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 - const uint l0 = v_in; // 0...15 - const uint is = 0; -#else const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28 const uint is = v_in / 4; -#endif const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; @@ -44,7 +44,7 @@ void main() { FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); @@ -95,10 +95,10 @@ void main() { } tmp[gl_LocalInvocationID.x] = temp; - // sum up partial sums and write back result + barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } From abadba05be52cccf6c0da49534e37f6062ce8ded Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 30 Nov 2024 09:47:07 +0200 Subject: [PATCH 10/24] readme : refresh (#10587) * readme : refresh * readme : move section [no ci] * readme : clarify [no ci] * readme : fixes [no ci] * readme : more fixes [no ci] * readme : simplify [no ci] * readme : clarify GGUF --- README.md | 362 +++++++++++++++++++----------------------------------- 1 file changed, 123 insertions(+), 239 deletions(-) diff --git a/README.md b/README.md index 8aa3d1e59..c5321f26c 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Description The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide -variety of hardware - locally and in the cloud. +range of hardware - locally and in the cloud. - Plain C/C++ implementation without any dependencies - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks @@ -36,14 +36,17 @@ variety of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has -improved significantly thanks to many contributions. It is the main playground for developing new features for the -[ggml](https://github.com/ggerganov/ggml) library. +The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. -**Supported models:** +
+Models Typically finetunes of the base models below are supported as well. +Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md) + +**Text-only:** + - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 - [x] LLaMA 3 🦙🦙🦙 @@ -97,9 +100,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) -(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) - -**Multimodal models:** +**Multimodal:** - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) @@ -111,7 +112,10 @@ Typically finetunes of the base models below are supported as well. - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) -**Bindings:** +
+ +
+Bindings - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) @@ -138,7 +142,12 @@ Typically finetunes of the base models below are supported as well. - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) -**UI:** +
+ +
+UIs + +*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) @@ -148,6 +157,7 @@ Typically finetunes of the base models below are supported as well. - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) - [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) - [LARS](https://github.com/abgulati/LARS) (AGPL) - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) @@ -171,241 +181,35 @@ Typically finetunes of the base models below are supported as well. - [semperai/amica](https://github.com/semperai/amica) (MIT) - [withcatai/catai](https://github.com/withcatai/catai) (MIT) -*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* +
-**Tools:** +
+Tools - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) -**Infrastructure:** +
+ +
+Infrastructure - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly -**Games:** +
+ +
+Games + - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. -## Demo - -
-Typical run using LLaMA v2 13B on M2 Ultra - -``` -$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -I llama.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1) -I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) - -make: Nothing to be done for `default'. -main: build = 1041 (cf658ad) -main: seed = 1692823051 -llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) -llama_model_loader: - type f32: 81 tensors -llama_model_loader: - type q4_0: 281 tensors -llama_model_loader: - type q6_K: 1 tensors -llm_load_print_meta: format = GGUF V1 (latest) -llm_load_print_meta: arch = llama -llm_load_print_meta: vocab type = SPM -llm_load_print_meta: n_vocab = 32000 -llm_load_print_meta: n_merges = 0 -llm_load_print_meta: n_ctx_train = 4096 -llm_load_print_meta: n_ctx = 512 -llm_load_print_meta: n_embd = 5120 -llm_load_print_meta: n_head = 40 -llm_load_print_meta: n_head_kv = 40 -llm_load_print_meta: n_layer = 40 -llm_load_print_meta: n_rot = 128 -llm_load_print_meta: n_gqa = 1 -llm_load_print_meta: f_norm_eps = 1.0e-05 -llm_load_print_meta: f_norm_rms_eps = 1.0e-05 -llm_load_print_meta: n_ff = 13824 -llm_load_print_meta: freq_base = 10000.0 -llm_load_print_meta: freq_scale = 1 -llm_load_print_meta: model type = 13B -llm_load_print_meta: model ftype = mostly Q4_0 -llm_load_print_meta: model size = 13.02 B -llm_load_print_meta: general.name = LLaMA v2 -llm_load_print_meta: BOS token = 1 '' -llm_load_print_meta: EOS token = 2 '' -llm_load_print_meta: UNK token = 0 '' -llm_load_print_meta: LF token = 13 '<0x0A>' -llm_load_tensors: ggml ctx size = 0.11 MB -llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) -................................................................................................... -llama_new_context_with_model: kv self size = 400.00 MB -llama_new_context_with_model: compute buffer total size = 75.41 MB - -system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | -sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 -generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0 - - - Building a website can be done in 10 simple steps: -Step 1: Find the right website platform. -Step 2: Choose your domain name and hosting plan. -Step 3: Design your website layout. -Step 4: Write your website content and add images. 
-Step 5: Install security features to protect your site from hackers or spammers -Step 6: Test your website on multiple browsers, mobile devices, operating systems etc… -Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine! -Step 8: Start marketing and promoting the website via social media channels or paid ads -Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc… -Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further! -How does a Website Work? -A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! -The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. -How to -llama_print_timings: load time = 576.45 ms -llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) -llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) -llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) -llama_print_timings: total time = 25431.49 ms -``` -
-
-Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook - -And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: - -https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 - -
- -## Usage - -Here are the end-to-end binary build and model conversion steps for most supported models. - -### Basic usage - -Firstly, you need to get the binary. There are different methods that you can follow: -- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) -- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) -- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) -- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) - -You can run a basic completion using this command: - -```bash -llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 - -# Output: -# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. -``` - -See [this page](./examples/main/README.md) for a full list of parameters. - -### Conversation mode - -If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: - -```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv - -# Output: -# > hi, who are you? -# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? -# -# > what is 1+1? -# Easy peasy! The answer to 1+1 is... 2! -``` - -By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - -```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml -``` - -You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: - -```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' -``` - -### Web server - -[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. - -Example usage: - -```bash -./llama-server -m your_model.gguf --port 8080 - -# Basic web UI can be accessed via browser: http://localhost:8080 -# Chat completion endpoint: http://localhost:8080/v1/chat/completions -``` - -### Interactive mode - -> [!NOTE] -> If you prefer basic usage, please consider using conversation mode instead of interactive mode - -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. 
This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. - -Here is an example of a few-shot interaction, invoked with the command - -```bash -# default arguments using a 7B model -./examples/chat.sh - -# advanced chat with a 13B model -./examples/chat-13B.sh - -# custom arguments using a 13B model -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -``` - -Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. - -![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) - -### Persistent Interaction - -The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. - -```bash -# Start a new chat -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh - -# Resume that chat -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh - -# Start a different chat with the same prompt/model -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh - -# Different prompt cache for different prompt/model -PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ - CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh -``` - -### Constrained output with grammars - -`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: - -```bash -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' -``` - -The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). - -For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. - -## Build - -Please refer to [Build llama.cpp locally](./docs/build.md) - ## Supported backends | Backend | Target devices | @@ -420,23 +224,104 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [Vulkan](./docs/build.md#vulkan) | GPU | | [CANN](./docs/build.md#cann) | Ascend NPU | -## Tools +## Building and usage -### Prepare and Quantize +The main product of this project is the `llama` library. 
Its C-style interface can be found in [include/llama.h](include/llama.h). +The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries: -> [!NOTE] -> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. +- Clone this repository and build locally, see [how to build](./docs/build.md) +- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md) +- Use a Docker image, see [documentation for Docker](./docs/docker.md) +- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) -To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +### Obtaining and quantizing models -Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. -It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. +The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: -To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) +- [Trending](https://huggingface.co/models?library=gguf&sort=trending) +- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) + +After downloading a model, use the CLI tools to run it locally - see below. + +`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. + +The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: + +- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes +- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123) +- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) +- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) + +To learn more about model quantization, [read this documentation](./examples/quantize/README.md) + +### Using the `llama-cli` tool + +Run a basic text completion: + +```bash +llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 + +# Output: +# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. 
I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. +``` + +See [this page](./examples/main/README.md) for a full list of parameters. + +### Conversation mode + +Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter: + +```bash +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv + +# Output: +# > hi, who are you? +# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? +# +# > what is 1+1? +# Easy peasy! The answer to 1+1 is... 2! +``` + +By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) + +```bash +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml +``` + +You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: + +```bash +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' +``` + +### Constrained output with grammars + +`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON: + +```bash +llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +``` + +The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). + +For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ + +### Web server (`llama-server`) + +The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. + +Example usage: + +```bash +llama-server -m your_model.gguf --port 8080 + +# Basic web UI can be accessed via browser: http://localhost:8080 +# Chat completion endpoint: http://localhost:8080/v1/chat/completions +``` ### Perplexity (measuring model quality) -You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). +Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better). For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). 
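As a point of reference, the perplexity reported here follows the conventional definition over a tokenized text $x_1, \dots, x_N$ (the formula below is the general textbook definition, stated for clarity rather than quoted from the tool's docs):

$$
\mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N} \log p(x_i \mid x_{<i})\right)
$$

so lower values mean the model assigns higher probability to the reference text.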
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) @@ -456,7 +341,6 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - [main (cli)](./examples/main/README.md) - [server](./examples/server/README.md) -- [jeopardy](./examples/jeopardy/README.md) - [GBNF grammars](./grammars/README.md) **Development documentation** From 3e0ba0e604b1ac5b2cbca9a3f38f91f2be4ef1cd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 30 Nov 2024 10:09:21 +0200 Subject: [PATCH 11/24] readme : remove old badge --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index c5321f26c..37c6666aa 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) -[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) From 0c39f44d70d058940fe2afe50cfc789e3e44d756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 30 Nov 2024 18:13:18 +0100 Subject: [PATCH 12/24] ggml-cpu: replace AArch64 NEON assembly with intrinsics in ggml_gemv_q4_0_4x4_q8_0() (#10567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 94 +++++++++++----------------- 1 file changed, 37 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 14a1f00eb..61a92cfd9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -525,67 +525,47 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx; - __asm__ __volatile__( - "movi v31.16b, #0x4\n" - "movi v30.16b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x8\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "movi v29.16b, #0x0\n" - "mov x21, %x[nb]\n" - "2:" // Block loop - "ldr q28, [%x[b_ptr], #0x0]\n" - "ldr q27, [x22, #0x0]\n" - "movi v26.4s, #0x0\n" - "sub x20, x22, #0x2\n" - "ldr q25, [x22, #0x10]\n" - "ldr q24, [%x[b_ptr], #0x10]\n" - "sub x21, x21, #0x1\n" - "add x22, x22, #0x22\n" - "ldr q23, [%x[b_ptr], #0x20]\n" - "ldr q22, [%x[b_ptr], #0x30]\n" - "ld1r { v21.8h }, [x20]\n" - "ldr q20, [%x[b_ptr], #-0x8]\n" - "sshl v16.16b, v28.16b, v31.16b\n" - "and v28.16b, v28.16b, v30.16b\n" - "sshl v19.16b, v24.16b, v31.16b\n" - "and v24.16b, v24.16b, v30.16b\n" - "add %x[b_ptr], %x[b_ptr], #0x48\n" - "sshl v18.16b, v23.16b, v31.16b\n" - "and v23.16b, v23.16b, v30.16b\n" - ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" - "sshl v17.16b, v22.16b, v31.16b\n" - "and v22.16b, v22.16b, v30.16b\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v16.4s, v20.4h\n" - ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" - "fmul v16.4s, v16.4s, v21.4s\n" - ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" - ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" - ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" - ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" - ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" - ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v29.4s, v26.4s, v16.4s\n" - "cbnz x21, 2b\n" - "sub %x[nc], %x[nc], #0x4\n" - "str q29, [%x[res_ptr], #0x0]\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[nc], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) - : [a_ptr] "r" (a_ptr), [nb] "r" (nb) - : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" - ); + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *)vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d); + + int8x16_t a0 = vld1q_s8(a_ptr->qs); + int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); + float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d); + + int32x4_t ret = vdupq_n_s32(0); + + ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0); + ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1); + ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2); + ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3); + + ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0); + ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1); + ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2); + ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } return; } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) float sumf[4]; int sumi; From 43957ef203b4c9ceaee42c176b3ef44ea4359c85 Mon Sep 17 00:00:00 2001 From: Wang Qin <37098874+wangqin0@users.noreply.github.com> Date: Sat, 30 Nov 2024 19:19:44 -0800 Subject: [PATCH 13/24] build: update Makefile comments for C++ version change (#10598) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 83adcef28..e30821624 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ endif # Compile flags # -# keep standard at C11 and C++11 +# keep standard at C11 and C++17 MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++17 -fPIC From 6acce3971098772a8aacb10fe8550b4119110581 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 1 Dec 2024 11:25:17 +0200 Subject: [PATCH 14/24] readme : update the usage section with examples (#10596) * readme : update the usage section with examples * readme : more examples --- README.md | 274 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 201 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 37c6666aa..6fdd8d9ee 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ The `llama.cpp` project is the main playground for developing new features for t Typically finetunes of the base models below are supported as well. -Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md) +Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md) -**Text-only:** +#### Text-only - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 @@ -99,7 +99,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](./docs/deve - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) -**Multimodal:** +#### Multimodal - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) @@ -213,27 +213,27 @@ Instructions for adding support for new models: [HOWTO-add-model.md](./docs/deve | Backend | Target devices | | --- | --- | -| [Metal](./docs/build.md#metal-build) | Apple Silicon | -| [BLAS](./docs/build.md#blas-build) | All | -| [BLIS](./docs/backend/BLIS.md) | All | -| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | -| [CUDA](./docs/build.md#cuda) | Nvidia GPU | -| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | -| [Vulkan](./docs/build.md#vulkan) | GPU | -| [CANN](./docs/build.md#cann) | Ascend NPU | +| [Metal](docs/build.md#metal-build) | Apple Silicon | +| [BLAS](docs/build.md#blas-build) | All | +| [BLIS](docs/backend/BLIS.md) | All | +| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | +| [CUDA](docs/build.md#cuda) | Nvidia GPU | +| [hipBLAS](docs/build.md#hipblas) | AMD GPU | +| [Vulkan](docs/build.md#vulkan) | GPU | +| [CANN](docs/build.md#cann) | Ascend NPU | -## Building and usage +## Building the project The main product of this project 
is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h). The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries: -- Clone this repository and build locally, see [how to build](./docs/build.md) -- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md) -- Use a Docker image, see [documentation for Docker](./docs/docker.md) +- Clone this repository and build locally, see [how to build](docs/build.md) +- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md) +- Use a Docker image, see [documentation for Docker](docs/docker.md) - Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) -### Obtaining and quantizing models +## Obtaining and quantizing models The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: @@ -251,79 +251,204 @@ The Hugging Face platform provides a variety of online tools for converting, qua - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) -To learn more about model quantization, [read this documentation](./examples/quantize/README.md) +To learn more about model quantization, [read this documentation](examples/quantize/README.md) -### Using the `llama-cli` tool +## [`llama-cli`](examples/main) -Run a basic text completion: +#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality. -```bash -llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 +-
+ Run simple text completion -# Output: -# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. -``` + ```bash + llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -See [this page](./examples/main/README.md) for a full list of parameters. + # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. + ``` -### Conversation mode +
-Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter: +-
+ Run in conversation mode -```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv + ```bash + llama-cli -m model.gguf -p "You are a helpful assistant" -cnv -# Output: -# > hi, who are you? -# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? -# -# > what is 1+1? -# Easy peasy! The answer to 1+1 is... 2! -``` + # > hi, who are you? + # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? + # + # > what is 1+1? + # Easy peasy! The answer to 1+1 is... 2! + ``` -By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +
-```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml -``` +-
+ Run with custom chat template -You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: + ```bash + # use the "chatml" template + llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml -```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' -``` + # use a custom template + llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' + ``` -### Constrained output with grammars + [Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON: +
-```bash -llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' -``` +-
+ Constrain the output with a custom grammar -The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). + ```bash + llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' -For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ + # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"} + ``` -### Web server (`llama-server`) + The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md). -The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. + For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ -Example usage: +
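For a quick sense of what a grammar rule looks like, a small grammar can also be passed inline, assuming the build exposes the `--grammar` flag alongside `--grammar-file`; the GBNF rule below is a minimal sketch that restricts the reply to a yes/no answer:

```bash
# constrain the generated tokens to either " yes" or " no"
llama-cli -m model.gguf -p "Is the Earth round? Answer:" -n 8 \
    --grammar 'root ::= " yes" | " no"'
```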
-```bash -llama-server -m your_model.gguf --port 8080 -# Basic web UI can be accessed via browser: http://localhost:8080 -# Chat completion endpoint: http://localhost:8080/v1/chat/completions -``` +## [`llama-server`](examples/server) -### Perplexity (measuring model quality) +#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs. -Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better). -For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). +-
+ Start a local HTTP server with default configuration on port 8080 + + ```bash + llama-server -m model.gguf --port 8080 + + # Basic web UI can be accessed via browser: http://localhost:8080 + # Chat completion endpoint: http://localhost:8080/v1/chat/completions + ``` + +
+ +-
+ Support multiple-users and parallel decoding + + ```bash + # up to 4 concurrent requests, each with 4096 max context + llama-server -m model.gguf -c 16384 -np 4 + ``` + +
+ +-
+ Enable speculative decoding + + ```bash + # the draft.gguf model should be a small variant of the target model.gguf + llama-server -m model.gguf -md draft.gguf + ``` + +
+ +-
+ Serve an embedding model + + ```bash + # use the /embedding endpoint + llama-server -m model.gguf --embedding --pooling cls -ub 8192 + ``` + +
+ +-
+ Serve a reranking model + + ```bash + # use the /reranking endpoint + llama-server -m model.gguf --reranking + ``` + +
+ +-
+ Constrain all outputs with a grammar + + ```bash + # custom grammar + llama-server -m model.gguf --grammar-file grammar.gbnf + + # JSON + llama-server -m model.gguf --grammar-file grammars/json.gbnf + ``` + +
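Since the server exposes an OpenAI-compatible API (see the `/v1/chat/completions` endpoint noted above), any OpenAI-style client can talk to it. A minimal sketch using `curl`; the request fields follow the standard OpenAI chat-completions schema, and the host/port are assumed from the earlier example:

```bash
# assumes llama-server is already running on localhost:8080
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": "Write a haiku about GGUF."}
        ]
    }'
```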
+ + +## [`llama-perplexity`](examples/perplexity) + +#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text. + +-
+ Measure the perplexity over a text file + + ```bash + llama-perplexity -m model.gguf -f file.txt + + # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ... + # Final estimate: PPL = 5.4007 +/- 0.67339 + ``` + +
+ +-
+ Measure KL divergence + + ```bash + # TODO + ``` + +
+ +[^1]: [examples/perplexity/README.md](examples/perplexity/README.md) +[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) + +## [`llama-bench`](example/bench) + +#### Benchmark the performance of the inference for various parameters. + +-
+ Run default benchmark + + ```bash + llama-bench -m model.gguf + + # Output: + # | model | size | params | backend | threads | test | t/s | + # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | + # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 | + # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 | + # + # build: 3e0ba0e60 (4229) + ``` + +
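The defaults can be overridden to benchmark specific workloads; the sketch below shows a few common variations, with flag names assumed from `llama-bench --help` and possibly differing between builds:

```bash
# compare prompt processing at 512 and 1024 tokens plus 128-token generation,
# offload all layers to the GPU and emit machine-readable JSON
llama-bench -m model.gguf -p 512,1024 -n 128 -ngl 99 -o json
```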
+ + +## [`llama-simple`](examples/simple) + +#### A minimal example for implementing apps with `llama.cpp`. Useful for developers. + +-
+ Basic text completion + + ```bash + llama-simple -m model.gguf + + # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of + ``` + +
-To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) ## Contributing @@ -338,19 +463,19 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio ## Other documentation -- [main (cli)](./examples/main/README.md) -- [server](./examples/server/README.md) -- [GBNF grammars](./grammars/README.md) +- [main (cli)](examples/main/README.md) +- [server](examples/server/README.md) +- [GBNF grammars](grammars/README.md) -**Development documentation** +#### Development documentation -- [How to build](./docs/build.md) -- [Running on Docker](./docs/docker.md) -- [Build on Android](./docs/android.md) -- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) +- [How to build](docs/build.md) +- [Running on Docker](docs/docker.md) +- [Build on Android](docs/android.md) +- [Performance troubleshooting](docs/development/token_generation_performance_tips.md) - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) -**Seminal papers and background on the models** +#### Seminal papers and background on the models If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - LLaMA: @@ -361,3 +486,6 @@ If your issue is with model generation quality, then please at least scan the fo - GPT-3.5 / InstructGPT / ChatGPT: - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) + +#### References + From 86dc11c5bcf34db2749d8bd8d4fa07a542c94f84 Mon Sep 17 00:00:00 2001 From: alek3y <44779186+alek3y@users.noreply.github.com> Date: Sun, 1 Dec 2024 12:33:12 +0100 Subject: [PATCH 15/24] server : bind to any port when specified (#10590) --- examples/server/server.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9c86407c2..1c765f0ea 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3347,8 +3347,18 @@ int main(int argc, char ** argv) { llama_backend_free(); }; - // bind HTTP listen port, run the HTTP server in a thread - if (!svr->bind_to_port(params.hostname, params.port)) { + // bind HTTP listen port + bool was_bound = false; + if (params.port == 0) { + int bound_port = svr->bind_to_any_port(params.hostname); + if ((was_bound = (bound_port >= 0))) { + params.port = bound_port; + } + } else { + was_bound = svr->bind_to_port(params.hostname, params.port); + } + + if (!was_bound) { //LOG_ERROR("couldn't bind HTTP server socket", { // {"hostname", params.hostname}, // {"port", params.port}, @@ -3357,6 +3367,8 @@ int main(int argc, char ** argv) { clean_up(); return 1; } + + // run the HTTP server in a thread std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); From 3420909dffa50e70660524797a1e715a717684d2 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 1 Dec 2024 16:12:41 +0100 Subject: [PATCH 16/24] ggml : automatic selection of best CPU backend (#10606) * ggml : automatic selection of best CPU backend * amx : minor opt * add GGML_AVX_VNNI to enable avx-vnni, fix checks --- .devops/llama-server.Dockerfile | 20 +- 
CMakeLists.txt | 4 - Package.swift | 2 +- ggml/CMakeLists.txt | 1 + ggml/src/ggml-backend-impl.h | 58 ++++-- ggml/src/ggml-backend-reg.cpp | 270 ++++++++++++++++-------- ggml/src/ggml-cpu/CMakeLists.txt | 13 ++ ggml/src/ggml-cpu/amx/common.h | 1 - ggml/src/ggml-cpu/amx/mmq.cpp | 74 ++++--- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 298 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +- scripts/build-cpu.sh | 12 ++ 12 files changed, 599 insertions(+), 156 deletions(-) create mode 100644 ggml/src/ggml-cpu/cpu-feats-x86.cpp create mode 100755 scripts/build-cpu.sh diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 02accc85e..7110dda9e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -ENV LLAMA_CURL=1 -RUN make -j$(nproc) llama-server +RUN \ + # Build multiple versions of the CPU backend + scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ + scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ + scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ + scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ + # Build llama-server + cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build --target llama-server -j $(nproc) && \ + # Copy the built libraries to /app/lib + mkdir -p /app/lib && \ + mv libggml-cpu* /app/lib/ && \ + find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/llama-server /llama-server +COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/lib/ / ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d389dccb..f84fff9e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE) set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_AMX) - set(GGML_AMX ON) -endif() - if (NOT DEFINED GGML_CUDA_GRAPHS) set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() diff --git a/Package.swift b/Package.swift index 1e75aa7e2..d32b74a63 100644 --- a/Package.swift +++ b/Package.swift @@ -88,5 +88,5 @@ let package = Package( linkerSettings: linkerSettings ) ], - cxxLanguageStandard: .cxx11 + cxxLanguageStandard: .cxx17 ) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 789fa3b0c..06d371e09 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index dff7749b4..36d72e95f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -211,27 +211,45 @@ extern "C" { 
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); // Add backend dynamic loading support to the backend - typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); - #ifdef GGML_BACKEND_DL - #ifdef __cplusplus - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - extern "C" { \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - } \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } - #else - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } - #endif - #else - # define GGML_BACKEND_DL_IMPL(reg_fn) - #endif + // Initialize the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + // Optional: obtain a score for the backend based on the system configuration + // Higher scores are preferred, 0 means the backend is not supported in the current system + typedef int (*ggml_backend_score_t)(void); + +#ifdef GGML_BACKEND_DL +# ifdef __cplusplus +# define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ + extern "C" { \ + GGML_BACKEND_API int ggml_backend_score(void); \ + } \ + int ggml_backend_score(void) { \ + return score_fn(); \ + } +# else +# define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ + GGML_BACKEND_API int ggml_backend_score(void); \ + int ggml_backend_score(void) { \ + return score_fn(); \ + } +# endif +#else +# define GGML_BACKEND_DL_IMPL(reg_fn) +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) +#endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 3182b84f5..2c4bf11b0 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -2,8 +2,13 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include +#include #include +#include +#include +#include #include +#include #include #ifdef _WIN32 @@ -57,9 +62,71 @@ #include "ggml-kompute.h" #endif +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +static dl_handle * dl_load_library(const std::wstring & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +static dl_handle * dl_load_library(const std::string & path) { + std::wstring_convert> converter; + return dl_load_library(converter.from_bytes(path)); +} + +static void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +static void * dl_load_library(const std::string & path) { + dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); + + return handle; +} + +static void * dl_get_sym(dl_handle * handle, const 
char * name) { + return dlsym(handle, name); +} + +#endif + +using dl_handle_ptr = std::unique_ptr; + struct ggml_backend_reg_entry { ggml_backend_reg_t reg; - void * handle; + dl_handle_ptr handle; }; struct ggml_backend_registry { @@ -97,13 +164,16 @@ struct ggml_backend_registry { } ~ggml_backend_registry() { - while (!backends.empty()) { - // use silent since the log system may have been destroyed at this point - unload_backend(backends.back().reg, true); + // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources, + // since backend threads may still be running and accessing resources from the dynamic library + for (auto & entry : backends) { + if (entry.handle) { + entry.handle.release(); // NOLINT + } } } - void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { + void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) { if (!reg) { return; } @@ -112,7 +182,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back({ reg, handle }); + backends.push_back({ reg, std::move(handle) }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -126,79 +196,53 @@ struct ggml_backend_registry { } ggml_backend_reg_t load_backend(const char * path, bool silent) { -#ifdef _WIN32 - // suppress error dialogs for missing DLLs - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - HMODULE handle = LoadLibraryA(path); - + dl_handle_ptr handle { dl_load_library(path) }; if (!handle) { if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); } - SetErrorMode(old_mode); return nullptr; } - ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); - - SetErrorMode(old_mode); - - if (!backend_init) { + auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); + if (score_fn && score_fn() == 0) { if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); } - FreeLibrary(handle); return nullptr; } -#else - void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); - if (!handle) { + auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); + if (!backend_init_fn) { if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); } return nullptr; } - auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); - - if (!backend_init) { - if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); - } - dlclose(handle); - return nullptr; - } -#endif - ggml_backend_reg_t reg = backend_init(); - + ggml_backend_reg_t reg = backend_init_fn(); if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!silent) { if (!reg) { GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); } else { GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", - 
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); } } -#ifdef _WIN32 - FreeLibrary(handle); -#else - dlclose(handle); -#endif return nullptr; } GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); - register_backend(reg, handle); + + register_backend(reg, std::move(handle)); + return reg; } void unload_backend(ggml_backend_reg_t reg, bool silent) { auto it = std::find_if(backends.begin(), backends.end(), - [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; }); if (it == backends.end()) { if (!silent) { @@ -217,15 +261,6 @@ struct ggml_backend_registry { [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), devices.end()); - // unload library - if (it->handle) { -#ifdef _WIN32 - FreeLibrary((HMODULE) it->handle); -#else - dlclose(it->handle); -#endif - } - // remove backend backends.erase(it); } @@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) { get_reg().unload_backend(reg, true); } -void ggml_backend_load_all() { - std::vector search_prefix; - - // add the executable directory to the search path - // FIXME: this is convenient for development, but it should probably be disabled in production - +static std::string get_executable_path() { #if defined(__APPLE__) // get executable path std::vector path; @@ -364,7 +394,7 @@ void ggml_backend_load_all() { if (last_slash != std::string::npos) { base_path = base_path.substr(0, last_slash); } - search_prefix.push_back(base_path + "/"); + return base_path + "/"; #elif defined(__linux__) std::string base_path = "."; std::vector path(1024); @@ -386,38 +416,104 @@ void ggml_backend_load_all() { path.resize(path.size() * 2); } - search_prefix.push_back(base_path + "/"); + return base_path + "/"; +#elif defined(_WIN32) + std::vector path(MAX_PATH); + DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); + if (len == 0) { + return ""; + } + std::string base_path(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('\\'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + return base_path + "\\"; #endif +} - auto & reg = get_reg(); - - auto try_load = [&](const std::string & name) { - std::string os_name; +static std::string backend_filename_prefix() { #ifdef _WIN32 - os_name = "ggml-" + name + ".dll"; + return "ggml-"; #else - os_name = "libggml-" + name + ".so"; + return "libggml-"; #endif - if (reg.load_backend(os_name.c_str(), true)) { - return; +} + +static std::string backend_filename_suffix() { +#ifdef _WIN32 + return ".dll"; +#else + return ".so"; +#endif +} + +static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { + // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths + // TODO: search system paths + std::vector search_paths = { "./", get_executable_path() }; + std::string file_prefix = backend_filename_prefix() + name + "-"; + + int best_score = 0; + std::string best_path; + + namespace fs = std::filesystem; + for (const auto & search_path : search_paths) { + if (!fs::exists(search_path)) { + continue; } - for (const auto & prefix : search_prefix) { - if (reg.load_backend((prefix + os_name).c_str(), true)) { - return; + for (const auto & entry : fs::directory_iterator(search_path)) { + if (entry.is_regular_file()) { + std::string filename = 
entry.path().filename().string(); + std::string ext = entry.path().extension().string(); + if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { + dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; + if (!handle && !silent) { + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); + } + if (handle) { + auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); + if (score_fn) { + int s = score_fn(); +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); +#endif + if (s > best_score) { + best_score = s; + best_path = entry.path().string(); + } + } + } + } } } - }; + } - try_load("amx"); - try_load("blas"); - try_load("cann"); - try_load("cuda"); - try_load("hip"); - try_load("kompute"); - try_load("metal"); - try_load("rpc"); - try_load("sycl"); - try_load("vulkan"); - try_load("musa"); - try_load("cpu"); + if (best_score == 0) { + // try to load the base backend + for (const auto & search_path : search_paths) { + std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); + if (fs::exists(path)) { + return get_reg().load_backend(path.c_str(), silent); + } + } + return nullptr; + } + + return get_reg().load_backend(best_path.c_str(), silent); +} + +void ggml_backend_load_all() { + ggml_backend_load_best("blas", true); + ggml_backend_load_best("cann", true); + ggml_backend_load_best("cuda", true); + ggml_backend_load_best("hip", true); + ggml_backend_load_best("kompute", true); + ggml_backend_load_best("metal", true); + ggml_backend_load_best("rpc", true); + ggml_backend_load_best("sycl", true); + ggml_backend_load_best("vulkan", true); + ggml_backend_load_best("musa", true); + ggml_backend_load_best("cpu", true); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index fe2222084..5df63884c 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_DEFINITIONS __AVXVNNI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavxvnni) + endif() + endif() else() if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) @@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (GGML_AVX2) list(APPEND ARCH_FLAGS -mavx2) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + endif() if (GGML_AVX512) list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512dq) @@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") +# the feature detection code must be compiled without any architecture flags +target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) +# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection + if (EMSCRIPTEN) set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h index 0b0657289..40074c3fc 100644 --- a/ggml/src/ggml-cpu/amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int 
n, const f int tbegin, tend; balance211(n, params->nth, params->ith, tbegin, tend); f(tbegin, tend); - ggml_barrier(params->threadpool); // TODO: might not always be needed } // quantized types that have AMX support diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 6447e73d0..0ec3aa86d 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx __m512 vb[COLS]; __m512 vc[ROWS * COLS]; - auto loadc = [&](int idx) { + auto loadc = [&](auto idx) { vc[idx] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int idx, int k) { - // TODO: use `constexpr` here to get rid of interger div - // when upgraded to C++17 - const int row = idx / COLS; - const int col = idx % COLS; + auto compute = [&](auto idx, auto k) { + constexpr int row = idx / COLS; + constexpr int col = idx % COLS; - if (col == 0) { + if constexpr (col == 0) { va = _mm512_loadu_ps(A + row * K + k); } - if (row == 0) { + if constexpr (row == 0) { vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); } vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); @@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx Unroll{}(compute, k); } - auto storec = [&](int idx) { - const int row = idx / COLS; - const int col = idx % COLS; + auto storec = [&](auto idx) { + constexpr int row = idx / COLS; + constexpr int col = idx % COLS; C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); }; Unroll{}(storec); @@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a and compute compensation - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); vcomp = _mm512_setzero_si512(); for (int k = 0; k < 8; ++k) { @@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni const __m512i lowMask = _mm512_set1_epi8(0xF); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); @@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni } //store to C - auto storec = [&](int col) { + auto storec = [&](auto col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); }; Unroll{}(storec); @@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni(0x80)); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a and add offset 128 - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); @@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni{}(loadc); @@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni{}(loadc); // Q5_K and Q4_K shares the same vnni formats, refer to notes above. 
- auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a - if (col == 0) { + if constexpr (col == 0) { for (int k_group = 0; k_group < QK_K / 32; ++k_group) { va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); } @@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni{}(loadc); - auto compute = [&](int col, int i) { - if (col == 0) { + auto compute = [&](auto col, auto i) { + if constexpr (col == 0) { // load a va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); @@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni(0x80)); const __m512i values256 = _mm512_add_epi8(values128, off); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { - if (col == 0) { + auto compute = [&](auto col, auto i) { + if constexpr (col == 0) { // load a va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); @@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni{}(storec); diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp new file mode 100644 index 000000000..514701ffe --- /dev/null +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -0,0 +1,298 @@ +#include "ggml-cpu.h" +#include "ggml-backend-impl.h" + +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#include + +struct cpuid_x86 { + bool SSE3(void) { return f_1_ecx[0]; } + bool PCLMULQDQ(void) { return f_1_ecx[1]; } + bool MONITOR(void) { return f_1_ecx[3]; } + bool SSSE3(void) { return f_1_ecx[9]; } + bool FMA(void) { return f_1_ecx[12]; } + bool CMPXCHG16B(void) { return f_1_ecx[13]; } + bool SSE41(void) { return f_1_ecx[19]; } + bool SSE42(void) { return f_1_ecx[20]; } + bool MOVBE(void) { return f_1_ecx[22]; } + bool POPCNT(void) { return f_1_ecx[23]; } + bool AES(void) { return f_1_ecx[25]; } + bool XSAVE(void) { return f_1_ecx[26]; } + bool OSXSAVE(void) { return f_1_ecx[27]; } + bool AVX(void) { return f_1_ecx[28]; } + bool F16C(void) { return f_1_ecx[29]; } + bool RDRAND(void) { return f_1_ecx[30]; } + + bool MSR(void) { return f_1_edx[5]; } + bool CX8(void) { return f_1_edx[8]; } + bool SEP(void) { return f_1_edx[11]; } + bool CMOV(void) { return f_1_edx[15]; } + bool CLFSH(void) { return f_1_edx[19]; } + bool MMX(void) { return f_1_edx[23]; } + bool FXSR(void) { return f_1_edx[24]; } + bool SSE(void) { return f_1_edx[25]; } + bool SSE2(void) { return f_1_edx[26]; } + + bool FSGSBASE(void) { return f_7_ebx[0]; } + bool BMI1(void) { return f_7_ebx[3]; } + bool HLE(void) { return is_intel && f_7_ebx[4]; } + bool AVX2(void) { return f_7_ebx[5]; } + bool BMI2(void) { return f_7_ebx[8]; } + bool ERMS(void) { return f_7_ebx[9]; } + bool INVPCID(void) { return f_7_ebx[10]; } + bool RTM(void) { return is_intel && f_7_ebx[11]; } + bool AVX512F(void) { return f_7_ebx[16]; } + bool RDSEED(void) { return f_7_ebx[18]; } + bool ADX(void) { return f_7_ebx[19]; } + bool AVX512PF(void) { return f_7_ebx[26]; } + bool AVX512ER(void) { return f_7_ebx[27]; } + bool AVX512CD(void) { return f_7_ebx[28]; } + bool SHA(void) { return f_7_ebx[29]; } + + bool PREFETCHWT1(void) { return f_7_ecx[0]; } + + bool LAHF(void) { return f_81_ecx[0]; } + bool 
LZCNT(void) { return is_intel && f_81_ecx[5]; } + bool ABM(void) { return is_amd && f_81_ecx[5]; } + bool SSE4a(void) { return is_amd && f_81_ecx[6]; } + bool XOP(void) { return is_amd && f_81_ecx[11]; } + bool TBM(void) { return is_amd && f_81_ecx[21]; } + + bool SYSCALL(void) { return is_intel && f_81_edx[11]; } + bool MMXEXT(void) { return is_amd && f_81_edx[22]; } + bool RDTSCP(void) { return is_intel && f_81_edx[27]; } + bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; } + bool _3DNOW(void) { return is_amd && f_81_edx[31]; } + + bool AVX512_VBMI(void) { return f_7_ecx[1]; } + bool AVX512_VNNI(void) { return f_7_ecx[11]; } + bool AVX512_FP16(void) { return f_7_edx[23]; } + bool AVX512_BF16(void) { return f_7_1_eax[5]; } + bool AVX_VNNI(void) { return f_7_1_eax[4]; } + + bool AMX_TILE(void) { return f_7_edx[24]; } + bool AMX_INT8(void) { return f_7_edx[25]; } + bool AMX_FP16(void) { return f_7_1_eax[21]; } + bool AMX_BF16(void) { return f_7_edx[22]; } + +#ifdef _MSC_VER + static void cpuid(int cpu_info[4], int eax) { + __cpuid(cpu_info, eax); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __cpuidex(cpu_info, eax, ecx); + } +#else + static void cpuid(int cpu_info[4], int eax) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(0)); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(ecx)); + } +#endif + + cpuid_x86() { + std::array cpui; + std::vector> data; + + // calling __cpuid with 0x0 as the function_id argument + // gets the number of the highest valid function ID. + cpuid(cpui.data(), 0); + int n_ids = cpui[0]; + + for (int i = 0; i <= n_ids; ++i) { + cpuidex(cpui.data(), i, 0); + data.push_back(cpui); + } + + // capture vendor string + char vendor[0x20] = {}; + *reinterpret_cast(vendor) = data[0][1]; + *reinterpret_cast(vendor + 4) = data[0][3]; + *reinterpret_cast(vendor + 8) = data[0][2]; + this->vendor = vendor; + if (this->vendor == "GenuineIntel") { + is_intel = true; + } else if (this->vendor == "AuthenticAMD") { + is_amd = true; + } + + // load bitset with flags for function 0x00000001 + if (n_ids >= 1) { + f_1_ecx = data[1][2]; + f_1_edx = data[1][3]; + } + + // load bitset with flags for function 0x00000007 + if (n_ids >= 7) { + f_7_ebx = data[7][1]; + f_7_ecx = data[7][2]; + f_7_edx = data[7][3]; + cpuidex(cpui.data(), 7, 1); + f_7_1_eax = cpui[0]; + } + + // calling __cpuid with 0x80000000 as the function_id argument + // gets the number of the highest valid extended ID. 
+ cpuid(cpui.data(), 0x80000000); + unsigned int n_ex_ids = cpui[0]; + + std::vector> ext_data; + for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { + cpuidex(cpui.data(), i, 0); + ext_data.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (n_ex_ids >= 0x80000001) { + f_81_ecx = ext_data[1][2]; + f_81_edx = ext_data[1][3]; + } + + // interpret CPU brand string if reported + char brand[0x40] = {}; + if (n_ex_ids >= 0x80000004) { + std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); + std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); + std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); + this->brand = brand; + } + } + + bool is_intel = false; + bool is_amd = false; + std::string vendor; + std::string brand; + std::bitset<32> f_1_ecx; + std::bitset<32> f_1_edx; + std::bitset<32> f_7_ebx; + std::bitset<32> f_7_ecx; + std::bitset<32> f_7_edx; + std::bitset<32> f_7_1_eax; + std::bitset<32> f_81_ecx; + std::bitset<32> f_81_edx; +}; + +#if 0 +void test_x86_is() { + cpuid_x86 is; + printf("CPU Vendor: %s\n", is.vendor.c_str()); + printf("Brand: %s\n", is.brand.c_str()); + printf("is_intel: %d\n", is.is_intel); + printf("is_amd: %d\n", is.is_amd); + printf("sse3: %d\n", is.SSE3()); + printf("pclmulqdq: %d\n", is.PCLMULQDQ()); + printf("ssse3: %d\n", is.SSSE3()); + printf("fma: %d\n", is.FMA()); + printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); + printf("sse41: %d\n", is.SSE41()); + printf("sse42: %d\n", is.SSE42()); + printf("movbe: %d\n", is.MOVBE()); + printf("popcnt: %d\n", is.POPCNT()); + printf("aes: %d\n", is.AES()); + printf("xsave: %d\n", is.XSAVE()); + printf("osxsave: %d\n", is.OSXSAVE()); + printf("avx: %d\n", is.AVX()); + printf("f16c: %d\n", is.F16C()); + printf("rdrand: %d\n", is.RDRAND()); + printf("msr: %d\n", is.MSR()); + printf("cx8: %d\n", is.CX8()); + printf("sep: %d\n", is.SEP()); + printf("cmov: %d\n", is.CMOV()); + printf("clflush: %d\n", is.CLFSH()); + printf("mmx: %d\n", is.MMX()); + printf("fxsr: %d\n", is.FXSR()); + printf("sse: %d\n", is.SSE()); + printf("sse2: %d\n", is.SSE2()); + printf("fsgsbase: %d\n", is.FSGSBASE()); + printf("bmi1: %d\n", is.BMI1()); + printf("hle: %d\n", is.HLE()); + printf("avx2: %d\n", is.AVX2()); + printf("bmi2: %d\n", is.BMI2()); + printf("erms: %d\n", is.ERMS()); + printf("invpcid: %d\n", is.INVPCID()); + printf("rtm: %d\n", is.RTM()); + printf("avx512f: %d\n", is.AVX512F()); + printf("rdseed: %d\n", is.RDSEED()); + printf("adx: %d\n", is.ADX()); + printf("avx512pf: %d\n", is.AVX512PF()); + printf("avx512er: %d\n", is.AVX512ER()); + printf("avx512cd: %d\n", is.AVX512CD()); + printf("sha: %d\n", is.SHA()); + printf("prefetchwt1: %d\n", is.PREFETCHWT1()); + printf("lahf: %d\n", is.LAHF()); + printf("lzcnt: %d\n", is.LZCNT()); + printf("abm: %d\n", is.ABM()); + printf("sse4a: %d\n", is.SSE4a()); + printf("xop: %d\n", is.XOP()); + printf("tbm: %d\n", is.TBM()); + printf("syscall: %d\n", is.SYSCALL()); + printf("mmxext: %d\n", is.MMXEXT()); + printf("rdtscp: %d\n", is.RDTSCP()); + printf("3dnowext: %d\n", is._3DNOWEXT()); + printf("3dnow: %d\n", is._3DNOW()); + printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); + printf("avx512_vnni: %d\n", is.AVX512_VNNI()); + printf("avx512_fp16: %d\n", is.AVX512_FP16()); + printf("avx512_bf16: %d\n", is.AVX512_BF16()); + printf("amx_tile: %d\n", is.AMX_TILE()); + printf("amx_int8: %d\n", is.AMX_INT8()); + printf("amx_fp16: %d\n", is.AMX_FP16()); + printf("amx_bf16: %d\n", is.AMX_BF16()); +} +#endif + +static int ggml_backend_cpu_x86_score() { + // FIXME: this does 
not check for OS support + + cpuid_x86 is; + // if the CPU backend was built with any features not supported by the current CPU, it cannot be used + if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } + if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } + if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } + if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } + if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } + if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } + if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } + if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } + if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } + if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } + if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } + if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } + + // calculate a backend score based on the supported features + // more important features have a higher weight + int score = 0; + score += ggml_cpu_has_fma () * 1; + score += ggml_cpu_has_f16c () * 1<<1; + score += ggml_cpu_has_ssse3 () * 1<<2; + score += ggml_cpu_has_sse3 () * 1<<3; + score += ggml_cpu_has_avx_vnni () * 1<<4; + score += ggml_cpu_has_avx () * 1<<5; + score += ggml_cpu_has_avx2 () * 1<<6; + score += ggml_cpu_has_avx512 () * 1<<7; + // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used + score += ggml_cpu_has_avx512_bf16() * 1<<9; + score += ggml_cpu_has_avx512_vnni() * 1<<10; + score += ggml_cpu_has_amx_int8 () * 1<<11; + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) + +#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 61a92cfd9..11152385e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) { } static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { -#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) +#if defined(__AVX512VNNI__) const __m512i zero = _mm512_setzero_si512(); return _mm512_dpbusd_epi32(zero, ax, sy); #else diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh new file mode 100755 index 000000000..4b2ad816e --- /dev/null +++ b/scripts/build-cpu.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +name="$1" +args="${@:2}" + +echo "Building $name with args: $args" + +rm -fr build-cpu-$1 +cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args +cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) +cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so +rm -fr build-cpu-$1 From 5c7a5aa0c32eb19ce03e178560797db5875d7692 Mon Sep 17 00:00:00 2001 From: Wang Qin <37098874+wangqin0@users.noreply.github.com> Date: Sun, 1 Dec 2024 10:11:42 -0800 Subject: [PATCH 17/24] ci: add error handling for Python venv creation in run.sh (#10608) --- ci/run.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 20610e560..abf08a4ff 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -815,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then ln -sfn ${mnt_models} ${SRC}/models-mnt # Create a fresh python3 venv and enter it - python3 -m venv "$MNT/venv" + if ! python3 -m venv "$MNT/venv"; then + echo "Error: Failed to create Python virtual environment at $MNT/venv." 
+ exit 1 + fi source "$MNT/venv/bin/activate" pip install -r ${SRC}/requirements.txt --disable-pip-version-check From 5e1ed95583ca552a98d8528b73e1ff81249c2bf9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 1 Dec 2024 21:37:54 +0200 Subject: [PATCH 18/24] grammars : add English-only grammar (#10612) --- grammars/english.gbnf | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 grammars/english.gbnf diff --git a/grammars/english.gbnf b/grammars/english.gbnf new file mode 100644 index 000000000..2e53686c8 --- /dev/null +++ b/grammars/english.gbnf @@ -0,0 +1,6 @@ +# note: this might be incomplete, mostly an example +root ::= en-char+ ([ \t\n] en-char+)* +en-char ::= letter | digit | punctuation +letter ::= [a-zA-Z] +digit ::= [0-9] +punctuation ::= [!"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~] From 917786f43d0f29b7c77a0c56767c0fa4df68b1c5 Mon Sep 17 00:00:00 2001 From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com> Date: Sun, 1 Dec 2024 22:09:49 +0000 Subject: [PATCH 19/24] Add `mistral-v1`, `mistral-v3`, `mistral-v3-tekken` and `mistral-v7` chat template types (#10572) * Templates: `mistral-v1`, `mistral-v2`, `mistral-v3`, `mistral-v3-tekken` * Changed system message logic and added tests for all 4 * Invalid `system_message` instead of `content` fixed * Removed tab-indented lines * Added template code and test for `mistral-v7` * Added all tests. Fixed bug with `tmpl == "llama2"` test. * Replaced tabs with spaces. * Removed `'mistral-v2'` option as no (open) models ever used it * Removed all references to 'v2' template from comments * Update llama.cpp Fixed `trim_assistant_message` bug --- src/llama.cpp | 108 ++++++++++++++++++++++++----------- tests/test-chat-template.cpp | 32 ++++++++++- 2 files changed, 105 insertions(+), 35 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 22b951ba2..6e9ba9727 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21862,41 +21862,85 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) { - // llama2 template and its variants - // [variant] support system message - bool support_system_message = tmpl_contains("<>") || tmpl == "mistral"; - // [variant] space before + after response - bool space_around_response = tmpl_contains("' ' + eos_token"); - // [variant] add BOS inside history - bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); - // [variant] trim spaces from the input message - bool strip_message = tmpl_contains("content.strip()"); - // construct the prompt - bool is_inside_turn = true; // skip BOS at the beginning - ss << "[INST] "; - for (auto message : chat) { - std::string content = strip_message ? trim(message->content) : message->content; - std::string role(message->role); - if (!is_inside_turn) { - is_inside_turn = true; - ss << (add_bos_inside_history ? 
"[INST] " : "[INST] "); - } - if (role == "system") { - if (support_system_message) { - ss << "<>\n" << content << "\n<>\n\n"; - } else { - // if the model does not support system message, we still include it in the first message, but without <> - ss << content << "\n"; + } else if (tmpl == "llama2" || tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { + if (tmpl == "mistral-v7" || tmpl_contains("[SYSTEM_PROMPT]")) { + // Official mistral 'v7' template + // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 + for (auto message : chat) { + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; + } else if (role == "user") { + ss << "[INST] " << content << "[/INST]"; + } + else { + ss << " " << content << ""; } - } else if (role == "user") { - ss << content << " [/INST]"; - } else { - ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << ""; - is_inside_turn = false; } + } else if (tmpl == "mistral-v1" || tmpl == "mistral-v3" || tmpl == "mistral-v3-tekken" + || tmpl_contains("' [INST] ' + system_message") // catches official 'v1' template + || tmpl_contains("[AVAILABLE_TOOLS]")) { // catches official 'v3' and 'v3-tekken' templates + // Official mistral 'v1', 'v3' and 'v3-tekken' templates + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md + std::string leading_space = (tmpl == "mistral-v1" || tmpl_contains(" [INST]") ? " " : ""); + std::string trailing_space = (tmpl == "mistral-v3-tekken" || tmpl_contains("\"[INST]\"") ? "" : " "); + bool trim_assistant_message = tmpl_contains("|trim + eos_token"); + bool is_inside_turn = false; + for (auto message : chat) { + if (!is_inside_turn) { + ss << leading_space << "[INST]" << trailing_space; + is_inside_turn = true; + } + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << content << "\n\n"; + } else if (role == "user") { + ss << content << leading_space << "[/INST]"; + } else { + ss << trailing_space << (trim_assistant_message ? trim(content) : content) << ""; + is_inside_turn = false; + } + } + } else { + // llama2 template and its variants + // [variant] support system message + // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + bool support_system_message = tmpl_contains("<>") || tmpl == "llama2"; + // [variant] space before + after response + bool space_around_response = tmpl_contains("' ' + eos_token"); + // [variant] add BOS inside history + bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); + // [variant] trim spaces from the input message + bool strip_message = tmpl_contains("content.strip()"); + // construct the prompt + bool is_inside_turn = true; // skip BOS at the beginning + ss << "[INST] "; + for (auto message : chat) { + std::string content = strip_message ? trim(message->content) : message->content; + std::string role(message->role); + if (!is_inside_turn) { + is_inside_turn = true; + ss << (add_bos_inside_history ? 
"[INST] " : "[INST] "); + } + if (role == "system") { + if (support_system_message) { + ss << "<>\n" << content << "\n<>\n\n"; + } else { + // if the model does not support system message, we still include it in the first message, but without <> + ss << content << "\n"; + } + } else if (role == "user") { + ss << content << " [/INST]"; + } else { + ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << ""; + is_inside_turn = false; + } + } + // llama2 templates seem to not care about "add_generation_prompt } - // llama2 templates seem to not care about "add_generation_prompt" } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) { // Phi 3 for (auto message : chat) { diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 03e897e66..dd8f7d5f0 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -21,7 +21,7 @@ int main(void) { std::vector templates = { // teknium/OpenHermes-2.5-Mistral-7B "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}", - // mistralai/Mistral-7B-Instruct-v0.2 + // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt) "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", // TheBloke/FusionNet_34Bx2_MoE-AWQ "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <>\\\\n' + messages[idx]['content'] + '\\\\n<>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}", @@ -67,11 +67,19 @@ int main(void) { "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", // ibm-granite/granite-3.0-8b-instruct "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- 
'<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}", + // mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt) + "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + // Mistral-Large-Instruct-2407 (mistralai 'v3' template) + "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\\n\\n\" + 
message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + // Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template) + "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + 
message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template) + "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n", - // mistralai/Mistral-7B-Instruct-v0.2 + // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt) "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", // TheBloke/FusionNet_34Bx2_MoE-AWQ "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", @@ -113,6 +121,14 @@ int main(void) { u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:", // ibm-granite/granite-3.0-8b-instruct "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n", + // 
mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt) + " [INST] You are a helpful assistant\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start) + "[INST] You are a helpful assistant\n\nHello[/INST] Hi there[INST] Who are you[/INST] I am an assistant[INST] Another question[/INST]", + // Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start) + "[INST]You are a helpful assistant\n\nHello[/INST]Hi there[INST]Who are you[/INST] I am an assistant [INST]Another question[/INST]", + // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template) + "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there[INST] Who are you[/INST] I am an assistant [INST] Another question[/INST]", }; std::vector formatted_chat(1024); int32_t res; @@ -154,7 +170,12 @@ int main(void) { return output; }; assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"); - assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n"); + assert(fmt_sys("mistral-v1") == " [INST] You are a helpful assistant\n\n"); + assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n"); + assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n"); + assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]"); + assert(fmt_sys("llama2") == "[INST] <>\nYou are a helpful assistant\n<>\n\n"); + assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>"); @@ -173,7 +194,12 @@ int main(void) { return output; }; assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); + assert(fmt_single("mistral-v1") == " [INST] How are you [/INST]"); + assert(fmt_single("mistral-v3") == "[INST] How are you[/INST]"); + assert(fmt_single("mistral-v3-tekken") == "[INST]How are you[/INST]"); + assert(fmt_single("mistral-v7") == "[INST] How are you[/INST]"); assert(fmt_single("llama2") == "[INST] How are you [/INST]"); + assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); From 4cb003dd8d1f37523120a21e4b1a50a2adcb8c84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 2 Dec 2024 08:53:27 +0200 Subject: [PATCH 20/24] contrib : refresh (#10593) * contrib : refresh * contrib : expand [no ci] * contrib : expand test-backend-ops instructions * contrib : add CODEOWNERS * prs : update template to not have checkbox [no ci] --- .github/pull_request_template.md | 8 +------- CODEOWNERS | 3 +++ CONTRIBUTING.md | 6 ++++-- 3 files changed, 8 insertions(+), 9 deletions(-) create mode 100644 CODEOWNERS diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 997c6d9d0..d9f5bdc23 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,7 +1 @@ - - -- [x] I have read the [contributing 
guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) -- Self-reported review complexity: - - [ ] Low - - [ ] Medium - - [ ] High +*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..88ab6de4f --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,3 @@ +# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs + +ci/ @ggerganov diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4c882c254..5a85ec5d2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,9 +1,10 @@ # Pull requests (for contributors) - Test your changes: - - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library - Execute [the full CI locally on your machine](ci/README.md) before publishing -- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs + - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) + - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) + - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - If your PR becomes stale, don't hesitate to ping the maintainers in the comments @@ -12,6 +13,7 @@ - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. 
For example: `utils : fix typo in utils.py (#1234)` - Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Consider adding yourself to [CODEOWNERS](CODEOWNERS) # Coding guidelines From 991f8aabeec89d801300bb179e52013fb0eb0584 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 2 Dec 2024 12:34:11 +0530 Subject: [PATCH 21/24] SYCL: Fix and switch to GGML_LOG system instead of fprintf (#10579) * Switched to GGML_LOG * Fix missing semicolon --- ggml/src/ggml-sycl/ggml-sycl.cpp | 90 ++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 808f74fa0..1310981e5 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.device_count = dpct::dev_mgr::instance().device_count(); if (info.device_count == 0) { - fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); + GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); return info; } @@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() { int64_t total_vram = 0; #if defined(GGML_SYCL_FORCE_MMQ) - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); + GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); #else - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); + GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); #endif #if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); + GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__); #else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); + GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); #endif - fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); + GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); for (int i = 0; i < info.device_count; ++i) { info.devices[i].vmm = 0; @@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) auto global_mem_size = prop.get_global_mem_size()/1000000; - fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), + GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), global_mem_size, device.get_info().c_str()); @@ -120,19 +120,30 @@ void ggml_backend_sycl_print_sycl_devices() { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); int device_count = dpct::dev_mgr::instance().device_count(); std::map DeviceNums; - fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "| | | | |Max | |Max |Global | |\n"); - fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); - fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); - fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); + GGML_LOG_INFO("Found %d SYCL devices:\n", device_count); + + GGML_LOG_INFO( + "| | | | " + " |Max | |Max |Global | |\n"); + GGML_LOG_INFO( + "| | | | " + " |compute|Max work|sub |mem | |\n"); + GGML_LOG_INFO( + "|ID| Device Type| " + "Name|Version|units |group |group|size | Driver version|\n"); + GGML_LOG_INFO( + 
"|--|-------------------|---------------------------------------|------" + "-|-------|--------|-----|-------|---------------------|\n"); + for (int id = 0; id < device_count; ++id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - sycl::backend backend = device.get_backend(); - std::string backend_type = get_device_backend_and_type(device); - int type_id=DeviceNums[backend_type]++; - std::stringstream device_type; - device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; - print_device_detail(id, device, device_type.str()); + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id = DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) + << "]"; + print_device_detail(id, device, device_type.str()); } } @@ -154,15 +165,14 @@ static void ggml_check_sycl() try { static bool initialized = false; if (!initialized) { - fprintf(stderr, "[SYCL] call ggml_check_sycl\n"); + GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n"); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - - fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); + GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); #if defined(GGML_SYCL_F16) - fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); + GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__); #else - fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); + GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__); #endif /* NOT REMOVE, keep it for next optimize for XMX. @@ -180,9 +190,10 @@ static void ggml_check_sycl() try { return; } GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); + initialized = true; g_sycl_loaded = true; + ggml_backend_sycl_print_sycl_devices(); } } catch (sycl::exception const &exc) { @@ -205,7 +216,7 @@ inline void check_allow_gpu_index(const int device_index) { __func__, device_index, ggml_sycl_info().device_count - 1); - fprintf(stderr, "%s\n", error_buf); + GGML_LOG_ERROR("%s\n", error_buf); assert(false); } } @@ -475,8 +486,8 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( size, *stream))); if (!dev_ptr) { - fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size); - return nullptr; + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); + return nullptr; } ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); @@ -752,7 +763,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, size, *stream))); if (!buf) { char err_buf[1024]; - snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); + snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); throw std::runtime_error(err_buf); } // set padding to 0 to avoid possible NaN values @@ -1142,17 +1153,18 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( look_ahead_size, *qptr))); if (!ptr) { - fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size); + GGML_LOG_ERROR("%s: can't 
allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size); return nullptr; } *actual_size = look_ahead_size; pool_size += look_ahead_size; - #ifdef DEBUG_SYCL_MALLOC - fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); - #endif +#endif + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); return ptr; } @@ -1166,7 +1178,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { return; } } - fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); + GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); pool_size -= size; } @@ -2437,7 +2449,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te break; default: // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_ABORT("fatal error"); break; } @@ -3750,7 +3762,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { - fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); GGML_ABORT("fatal error"); } @@ -3825,7 +3837,7 @@ void ggml_sycl_set_main_device(const int main_device) try { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, dpct::dev_mgr::instance().get_device(main_device)))); - fprintf(stderr, "Using device %d (%s) as main device\n", + GGML_LOG_INFO("Using device %d (%s) as main device\n", main_device, prop.get_name()); } } @@ -4172,7 +4184,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ #endif bool ok = ggml_sycl_compute_forward(*sycl_ctx, node); if (!ok) { - fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); } @@ -4672,7 +4684,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) { ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device); if (ctx == nullptr) { - fprintf(stderr, "%s: error: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; }; From 64ed2091b24b2f9747148fdf49a34ed5938762c3 Mon Sep 17 00:00:00 2001 From: haopeng <657407891@qq.com> Date: Mon, 2 Dec 2024 21:45:54 +0800 Subject: [PATCH 22/24] server: Add "tokens per second" information in the backend (#10548) * add cmake rvv support * add timings * remove space * update readme * fix * fix code * remove empty line * add test --------- Co-authored-by: Xuan Son Nguyen --- common/common.h | 1 + examples/server/README.md | 2 ++ examples/server/server.cpp | 16 +++++++++++++++- 
.../server/tests/unit/test_chat_completion.py | 17 +++++++++++++++++ examples/server/utils.hpp | 9 +++++++++ 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 9b1508a15..0373fd3ea 100644 --- a/common/common.h +++ b/common/common.h @@ -133,6 +133,7 @@ struct common_params_sampling { bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; bool no_perf = false; // disable performance metrics + bool timing_per_token = false; std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY diff --git a/examples/server/README.md b/examples/server/README.md index 877768c8b..45ffb547f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -416,6 +416,8 @@ node index.js `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. + `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` + **Response format** - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1c765f0ea..8eca14b86 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -177,6 +177,8 @@ struct server_slot { bool stopped_word = false; bool stopped_limit = false; + bool timings_per_token = false; + bool oaicompat = false; std::string oaicompat_model; @@ -882,6 +884,8 @@ struct server_context { slot.oaicompat_model = ""; } + slot.timings_per_token = json_value(data, "timings_per_token", false); + slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); @@ -1279,6 +1283,7 @@ struct server_context { {"speculative.n_max", slot.params.speculative.n_max}, {"speculative.n_min", slot.params.speculative.n_min}, {"speculative.p_min", slot.params.speculative.p_min}, + {"timings_per_token", slot.timings_per_token}, }; } @@ -1336,6 +1341,10 @@ struct server_context { res.data["model"] = slot.oaicompat_model; } + if (slot.timings_per_token) { + res.data["timings"] = slot.get_formated_timings(); + } + queue_results.send(res); } @@ -2274,12 +2283,17 @@ struct server_context { common_sampler_accept(slot.smpl, id, true); slot.n_decoded += 1; + + const int64_t t_current = ggml_time_us(); + if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); + slot.t_start_generation = t_current; slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; metrics.on_prompt_eval(slot); } + slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; + completion_token_output result; result.tok = id; diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 1048d6fca..8a439f9ef 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -146,3 +146,20 @@ def test_invalid_chat_completion_req(messages): }) assert res.status_code == 400 or res.status_code == 500 assert "error" in 
res.body + + +def test_chat_completion_with_timings_per_token(): + global server + server.start() + res = server.make_stream_request("POST", "/chat/completions", data={ + "max_tokens": 10, + "messages": [{"role": "user", "content": "test"}], + "stream": True, + "timings_per_token": True, + }) + for data in res: + assert "timings" in data + assert "prompt_per_second" in data["timings"] + assert "predicted_per_second" in data["timings"] + assert "predicted_n" in data["timings"] + assert data["timings"]["predicted_n"] <= 10 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 1665e9dc3..e4451532c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); } + if (result.contains("timings")) { + res.push_back({"timings", json_value(result, "timings", json::object())}); + } + return res; } @@ -740,6 +744,11 @@ static std::vector format_partial_response_oaicompat(const json & result, {"model", modelname}, {"object", "chat.completion.chunk"} }; + + if (result.contains("timings")) { + ret.push_back({"timings", json_value(result, "timings", json::object())}); + } + if (!finish_reason.empty()) { int num_tokens_predicted = json_value(result, "tokens_predicted", 0); int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); From 8648c521010620c2daccfa1d26015c668ba2c717 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 2 Dec 2024 21:22:53 +0200 Subject: [PATCH 23/24] make : deprecate (#10514) * make : deprecate ggml-ci * ci : disable Makefile builds ggml-ci * docs : remove make references [no ci] * ci : disable swift build ggml-ci * docs : remove obsolete make references, scripts, examples ggml-ci * basic fix for compare-commits.sh * update build.md * more build.md updates * more build.md updates * more build.md updates * Update Makefile Co-authored-by: Diego Devesa --------- Co-authored-by: slaren --- .github/workflows/build.yml | 163 ++------ Makefile | 4 + docs/backend/BLIS.md | 7 - docs/build.md | 245 +++++------- examples/base-translate.sh | 61 --- examples/convert-llama2c-to-ggml/README.md | 5 +- examples/imatrix/README.md | 2 - examples/server/README.md | 15 - scripts/compare-commits.sh | 18 +- scripts/pod-llama.sh | 212 ----------- scripts/server-llm.sh | 418 --------------------- 11 files changed, 139 insertions(+), 1011 deletions(-) delete mode 100755 examples/base-translate.sh delete mode 100644 scripts/pod-llama.sh delete mode 100644 scripts/server-llm.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e2291bd34..f3326a5fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -160,66 +160,6 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip name: llama-bin-macos-x64.zip - ubuntu-focal-make: - runs-on: ubuntu-20.04 - env: - LLAMA_NODE_AVAILABLE: true - LLAMA_PYTHON_AVAILABLE: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 - - - uses: actions/setup-node@v4 - with: - node-version: "20" - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - CC=gcc-8 make -j $(nproc) - - - name: Test - id: make_test - run: | - CC=gcc-8 make tests -j $(nproc) - make test 
-j $(nproc) - - ubuntu-focal-make-curl: - runs-on: ubuntu-20.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 - run: | - CC=gcc-8 make -j $(nproc) - ubuntu-latest-cmake: runs-on: ubuntu-latest @@ -517,36 +457,6 @@ jobs: cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. cmake --build . --config Release -j $(nproc) - # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know - # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 - macOS-latest-make: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: make_test - run: | - GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) - GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) - # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 @@ -642,33 +552,35 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: xcodebuild for swift package - id: xcodebuild - run: | - xcodebuild -scheme llama -destination "${{ matrix.destination }}" - - - name: Build Swift Example - id: make_build_swift_example - run: | - make swift +# TODO: tmp disabled. 
see for possible re-enable: +# https://github.com/ggerganov/llama.cpp/pull/10525 +# macOS-latest-swift: +# runs-on: macos-latest +# +# strategy: +# matrix: +# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v4 +# +# - name: Dependencies +# id: depends +# continue-on-error: true +# run: | +# brew update +# +# - name: xcodebuild for swift package +# id: xcodebuild +# run: | +# xcodebuild -scheme llama -destination "${{ matrix.destination }}" +# +# - name: Build Swift Example +# id: make_build_swift_example +# run: | +# make swift windows-msys2: runs-on: windows-latest @@ -695,21 +607,6 @@ jobs: mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-openblas - - name: Build using make - shell: msys2 {0} - run: | - make -j $(nproc) - - - name: Clean after building using make - shell: msys2 {0} - run: | - make clean - - - name: Build using make w/ OpenBLAS - shell: msys2 {0} - run: | - make GGML_OPENBLAS=1 -j $(nproc) - - name: Build using CMake shell: msys2 {0} run: | @@ -1257,9 +1154,7 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-focal-make - ubuntu-latest-cmake - - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - windows-2019-cmake-cuda diff --git a/Makefile b/Makefile index e30821624..f5dc916f6 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,7 @@ +ifndef LLAMA_MAKEFILE +$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) +endif + # Define the default target now so that it is always the first target BUILD_TARGETS = \ libllava.a \ diff --git a/docs/backend/BLIS.md b/docs/backend/BLIS.md index 35d06bd0f..904548577 100644 --- a/docs/backend/BLIS.md +++ b/docs/backend/BLIS.md @@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used. ### llama.cpp compilation -Makefile: - -```bash -make GGML_BLIS=1 -j -# make GGML_BLIS=1 llama-benchmark-matmult -``` - CMake: ```bash diff --git a/docs/build.md b/docs/build.md index 72b810437..5465629e2 100644 --- a/docs/build.md +++ b/docs/build.md @@ -7,124 +7,63 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp ``` -In order to build llama.cpp you have four different options. +The following sections describe how to build with different backends and options. -- Using `make`: - - On Linux or MacOS: +## CPU Build - ```bash - make - ``` +Build llama.cpp using `CMake`: - - On Windows (x86/x64 only, arm64 requires cmake): +```bash +cmake -B build +cmake --build build --config Release +``` - 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). - 2. Extract `w64devkit` on your pc. - 3. Run `w64devkit.exe`. - 4. Use the `cd` command to reach the `llama.cpp` folder. - 5. From here you can run: - ```bash - make - ``` +**Notes**: - - Notes: - - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. - - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - - For faster repeated compilation, install [ccache](https://ccache.dev/). - - For debug builds, run `make LLAMA_DEBUG=1` +- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. 
For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. +- For faster repeated compilation, install [ccache](https://ccache.dev/) +- For debug builds, there are two cases: -- Using `CMake`: + 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): - ```bash - cmake -B build - cmake --build build --config Release - ``` + ```bash + cmake -B build -DCMAKE_BUILD_TYPE=Debug + cmake --build build + ``` - **Notes**: + 2. Multi-config generators (`-G` param set to Visual Studio, XCode...): - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. - - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - - For faster repeated compilation, install [ccache](https://ccache.dev/). - - For debug builds, there are two cases: + ```bash + cmake -B build -G "Xcode" + cmake --build build --config Debug + ``` - 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): + For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). - ```bash - cmake -B build -DCMAKE_BUILD_TYPE=Debug - cmake --build build - ``` - - 2. Multi-config generators (`-G` param set to Visual Studio, XCode...): - - ```bash - cmake -B build -G "Xcode" - cmake --build build --config Debug - ``` - - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): - - Tab Workload: Desktop-development with C++ - - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) - - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test - - For Windows on ARM (arm64, WoA) build with: - ```bash - cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF - cmake --build build-arm64-windows-llvm-release - ``` - Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. - -- Using `gmake` (FreeBSD): - - 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) - 2. Add your user to **video** group - 3. Install compilation dependencies. - - ```bash - sudo pkg install gmake automake autoconf pkgconf llvm15 openblas - - gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 - ``` - -## Metal Build - -On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. -To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option. - -When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line -argument. +- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: + - Install Visual Studio 2022, e.g. 
via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Tab Workload: Desktop-development with C++ + - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) + - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test + - For Windows on ARM (arm64, WoA) build with: + ```bash + cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF + cmake --build build-arm64-windows-llvm-release + ``` + Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. ## BLAS Build -Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use: +Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use: -### Accelerate Framework: +### Accelerate Framework This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions. -### OpenBLAS: +### OpenBLAS This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine. -- Using `make`: - - On Linux: - ```bash - make GGML_OPENBLAS=1 - ``` - - - On Windows: - - 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). - 2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases). - 3. Extract `w64devkit` on your pc. - 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`. - 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`. - 6. Run `w64devkit.exe`. - 7. Use the `cd` command to reach the `llama.cpp` folder. - 8. From here you can run: - - ```bash - make GGML_OPENBLAS=1 - ``` - - Using `CMake` on Linux: ```bash @@ -136,14 +75,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i Check [BLIS.md](./backend/BLIS.md) for more information. -### SYCL - -SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. - -llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). - -For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). - ### Intel oneMKL Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. 
Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). @@ -161,16 +92,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. -### CUDA +### Other BLAS libraries -This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). +Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors. -For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. +## Metal Build + +On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. +To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option. + +When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument. + +## SYCL + +SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. + +llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). + +For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). + +## CUDA + +This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads). -- Using `make`: - ```bash - make GGML_CUDA=1 - ``` - Using `CMake`: ```bash @@ -192,14 +136,10 @@ The following compilation options are also available to tweak performance: | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | -### MUSA +## MUSA This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). -- Using `make`: - ```bash - make GGML_MUSA=1 - ``` - Using `CMake`: ```bash @@ -213,16 +153,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. -### hipBLAS +## HIP -This provides BLAS acceleration on HIP-supported AMD GPUs. +This provides GPU acceleration on HIP-supported AMD GPUs. 
Make sure to have ROCm installed. You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). -- Using `make`: - ```bash - make GGML_HIP=1 - ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ @@ -247,11 +183,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm && cmake --build build -- -j 16 ``` -- Using `make` (example for target gfx1030, build with 16 CPU threads): - ```bash - make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 - ``` - - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% @@ -265,11 +196,11 @@ You can download it from your Linux distro's package manager or from here: [ROCm The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. -### Vulkan +## Vulkan **Windows** -#### w64devkit +### w64devkit Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases). @@ -289,9 +220,14 @@ Libs: -lvulkan-1 EOF ``` -Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. -#### Git Bash MINGW64 +Switch into the `llama.cpp` directory and build using CMake. +```sh +cmake -B build -DGGML_VULKAN=ON +cmake --build build --config Release +``` + +### Git Bash MINGW64 Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings @@ -310,20 +246,21 @@ cmake --build build --config Release Now you can load the model in conversation mode using `Vulkan` -``` -build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv +```sh +build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv ``` -#### MSYS2 +### MSYS2 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. - ```sh - pacman -S git \ - mingw-w64-ucrt-x86_64-gcc \ - mingw-w64-ucrt-x86_64-cmake \ - mingw-w64-ucrt-x86_64-vulkan-devel \ - mingw-w64-ucrt-x86_64-shaderc - ``` -Switch into `llama.cpp` directory and build using CMake. +```sh +pacman -S git \ + mingw-w64-ucrt-x86_64-gcc \ + mingw-w64-ucrt-x86_64-cmake \ + mingw-w64-ucrt-x86_64-vulkan-devel \ + mingw-w64-ucrt-x86_64-shaderc +``` + +Switch into the `llama.cpp` directory and build using CMake. ```sh cmake -B build -DGGML_VULKAN=ON cmake --build build --config Release @@ -372,7 +309,7 @@ cmake --build build --config Release # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 ``` -### CANN +## CANN This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU. For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/). 
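Whichever of the GPU backends above is built (HIP, Vulkan, or CANN below), it can be worth confirming that the freshly built binary actually detects the expected device before loading a model. The following is a minimal sketch, assuming the default `build/` output directory and the `--list-devices` option described in the notes further down; the model path is a placeholder.

```bash
# List the devices the binary can see at runtime
./build/bin/llama-cli --list-devices

# Then try a short generation with layers offloaded to the GPU
./build/bin/llama-cli -m /path/to/model.gguf -ngl 99 -p "Hello" -n 16
```

If only the CPU shows up in the device list, the backend was most likely not enabled at configure time or its runtime libraries could not be loaded.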
@@ -387,22 +324,26 @@ cmake --build build --config release You can test with: -`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` - -If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`: ```bash -llm_load_tensors: CANN buffer size = 13313.00 MiB +./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32 +``` + +If the following info is output on screen, you are using `llama.cpp` with the CANN backend: +```bash +llm_load_tensors: CANN model buffer size = 13313.00 MiB llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB ``` For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). -### Android +## Android To read documentation for how to build on Android, [click here](./android.md) -### Arm CPU optimized mulmat kernels +## Notes about GPU-accelerated backends -Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. +The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option. + +Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building. diff --git a/examples/base-translate.sh b/examples/base-translate.sh deleted file mode 100755 index 103a52f55..000000000 --- a/examples/base-translate.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# -# Few-shot translation example. -# Requires a base model (i.e. no fine-tuned or instruct models). 
-# -# Usage: -# -# cd llama.cpp -# make -j -# -# ./examples/base-translate.sh "" [extra-main-args] -# - -if [ $# -lt 2 ]; then - echo "Usage: ./base-translate.sh \"\" [extra-main-args]" - exit 1 -fi - -eargs="" -if [ $# -gt 2 ]; then - eargs="${@:3}" -fi - -ftmp="__llama.cpp_example_tmp__.txt" -trap "rm -f $ftmp" EXIT - -echo "Translate from English to French: - -=== - -sea otter, peppermint, plush girafe: - -sea otter => loutre de mer -peppermint => menthe poivrée -plush girafe => girafe peluche - -=== - -violin - -violin => violon - -=== - -phone, computer, mouse, keyboard: - -phone => téléphone -computer => ordinateur -mouse => souris -keyboard => clavier - -=== -" > $ftmp - -echo "$2 -" >> $ftmp - -model=$1 - -# generate the most likely continuation until the string "===" is found -./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 5774ac83c..46a42da69 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -2,11 +2,8 @@ This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. -To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: +To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository. -`$ make -j` - -After successful compilation, following usage options are available: ``` usage: ./llama-convert-llama2c-to-ggml [options] diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index bb5faec94..9c056986b 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -25,8 +25,6 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -GGML_CUDA=1 make -j - # generate importance matrix (imatrix.dat) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/server/README.md b/examples/server/README.md index 45ffb547f..aa99d06f9 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -188,12 +188,6 @@ services: `llama-server` is built alongside everything else from the root of the project -- Using `make`: - - ```bash - make llama-server - ``` - - Using `CMake`: ```bash @@ -207,15 +201,6 @@ services: `llama-server` can also be built with SSL support using OpenSSL 3 -- Using `make`: - - ```bash - # NOTE: For non-system openssl, use the following: - # CXXFLAGS="-I /path/to/openssl/include" - # LDFLAGS="-L /path/to/openssl/lib" - make LLAMA_SERVER_SSL=true llama-server - ``` - - Using `CMake`: ```bash diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 8b9b1ad39..143d98729 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -16,15 +16,21 @@ bench_args="${@:3}" rm -f llama-bench.sqlite > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) +if [ -n "$GGML_CUDA" ]; then + cmake_opts="-DGGML_CUDA=ON" +fi + +function run { + rm -fr build > /dev/null + cmake -B build -S . 
$cmake_opts > /dev/null + cmake --build build -t llama-bench > /dev/null + build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite +} git checkout $1 > /dev/null -make clean > /dev/null -make -j$(nproc) $make_opts llama-bench > /dev/null -./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite +run git checkout $2 > /dev/null -make clean > /dev/null -make -j$(nproc) $make_opts llama-bench > /dev/null -./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite +run ./scripts/compare-llama-bench.py -b $1 -c $2 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh deleted file mode 100644 index 6e56e1ed0..000000000 --- a/scripts/pod-llama.sh +++ /dev/null @@ -1,212 +0,0 @@ -#!/bin/bash -# -# Use this script only on fresh pods (runpod.io)! -# Otherwise, it can break your environment! -# - -if [ -z "$1" ]; then - echo "Usage: $0 " - echo " 0: no models" - echo " 1: tinyllama-1b" - echo " 2: codellama-7b" - echo " 3: codellama-13b" - echo " 4: codellama-34b" - echo " 5: codellama-7b-instruct" - echo " 6: codellama-13b-instruct" - echo " 7: codellama-34b-instruct" - - exit 1 -fi - -set -x - -# setup deps -apt-get update -apt-get install -y git-lfs cmake cmake-curses-gui vim ruby -git-lfs install - -if [ ! -d "/workspace" ]; then - ln -sfn $(pwd) /workspace -fi - -# download data -cd /workspace - -# this is useful to git clone repos without doubling the disk size due to .git -git clone https://github.com/iboB/git-lfs-download -ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download - -# llama.cpp -cd /workspace -git clone https://github.com/ggerganov/llama.cpp - -cd llama.cpp - -GGML_CUDA=1 make -j - -ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b -ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b -ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b -ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b -ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct -ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct -ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct - -pip install -r requirements.txt - -# cmake -cd /workspace/llama.cpp - -mkdir build-cublas -cd build-cublas - -cmake -DGGML_CUDA=1 ../ -make -j - -if [ "$1" -eq "0" ]; then - exit 0 -fi - -# more models -if [ "$1" -eq "1" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3 - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "2" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors* - rm -v ./CodeLlama-7b-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k - 
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "3" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors* - rm -v ./CodeLlama-13b-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "4" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors* - rm -v ./CodeLlama-34b-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "5" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors* - rm -v ./CodeLlama-7b-Instruct-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "6" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors* - rm -v ./CodeLlama-13b-Instruct-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 - ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "7" ]; then - cd /workspace - - git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors* - rm -v ./CodeLlama-34b-Instruct-hf/*safetensors* - - cd /workspace/llama.cpp - - python3 examples/convert_legacy_llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 - - ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 - ./llama-quantize 
./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k - ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 -fi - -if [ "$1" -eq "1" ]; then - # perf + perplexity - cd /workspace/llama.cpp/build-cublas - - make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128" - - ../scripts/get-wikitext-2.sh - unzip wikitext-2-raw-v1.zip - - make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 - - # batched - cd /workspace/llama.cpp - - GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 - - # batched-bench - cd /workspace/llama.cpp - - GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 - - # parallel - cd /workspace/llama.cpp - - GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb - -fi - -# speculative -#if [ "$1" -eq "7" ]; then -# cd /workspace/llama.cpp -# -# GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 -#fi - -# more benches -#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh deleted file mode 100644 index 802592a3e..000000000 --- a/scripts/server-llm.sh +++ /dev/null @@ -1,418 +0,0 @@ -#!/bin/bash -# -# Helper script for deploying llama.cpp server with a single Bash command -# -# - Works on Linux and macOS -# - Supports: CPU, CUDA, Metal -# - Can run all GGUF models from HuggingFace -# - Can serve requests in parallel -# - Always builds latest llama.cpp from GitHub -# -# Limitations -# -# - Chat templates are poorly supported (base models recommended) -# - Might be unstable! -# -# Usage: -# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive] -# -# --port: port number, default is 8888 -# --repo: path to a repo containing GGUF model files -# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input -# --backend: cpu, cuda, metal, depends on the OS -# --gpu-id: gpu id, default is 0 -# --n-parallel: number of parallel requests, default is 8 -# --n-kv: KV cache size, default is 4096 -# --verbose: verbose output -# --non-interactive: run without asking a permission to run -# -# Example: -# -# bash -c "$(curl -s https://ggml.ai/server-llm.sh)" -# - -set -e - -# required utils: curl, git, make -if ! command -v curl &> /dev/null; then - printf "[-] curl not found\n" - exit 1 -fi -if ! command -v git &> /dev/null; then - printf "[-] git not found\n" - exit 1 -fi -if ! 
command -v make &> /dev/null; then - printf "[-] make not found\n" - exit 1 -fi - -# parse arguments -is_interactive=1 -port=8888 -repo="" -wtype="" -backend="cpu" - -# if macOS, use metal backend by default -if [[ "$OSTYPE" == "darwin"* ]]; then - backend="metal" -elif command -v nvcc &> /dev/null; then - backend="cuda" -fi - -gpu_id=0 -n_parallel=8 -n_kv=4096 -verbose=0 - -function print_usage { - printf "Usage:\n" - printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n" - printf " --port: port number, default is 8888\n" - printf " --repo: path to a repo containing GGUF model files\n" - printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" - printf " --backend: cpu, cuda, metal, depends on the OS\n" - printf " --gpu-id: gpu id, default is 0\n" - printf " --n-parallel: number of parallel requests, default is 8\n" - printf " --n-kv: KV cache size, default is 4096\n" - printf " --verbose: verbose output\n\n" - printf " --non-interactive: run without asking a permission to run\n" - printf "Example:\n\n" - printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' -} - -while [[ $# -gt 0 ]]; do - key="$1" - case $key in - --non-interactive) - is_interactive=0 - shift - ;; - --port) - port="$2" - shift - shift - ;; - --repo) - repo="$2" - shift - shift - ;; - --wtype) - wtype="$2" - shift - shift - ;; - --backend) - backend="$2" - shift - shift - ;; - --gpu-id) - gpu_id="$2" - shift - shift - ;; - --n-parallel) - n_parallel="$2" - shift - shift - ;; - --n-kv) - n_kv="$2" - shift - shift - ;; - --verbose) - verbose=1 - shift - ;; - --help) - print_usage - exit 0 - ;; - *) - echo "Unknown argument: $key" - print_usage - exit 1 - ;; - esac -done - -# available weights types -wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") - -wfiles=() -for wt in "${wtypes[@]}"; do - wfiles+=("") -done - -# map wtype input to index -if [[ ! -z "$wtype" ]]; then - iw=-1 - is=0 - for wt in "${wtypes[@]}"; do - # uppercase - uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]') - if [[ "$uwt" == "$wtype" ]]; then - iw=$is - break - fi - is=$((is+1)) - done - - if [[ $iw -eq -1 ]]; then - printf "[-] Invalid weight type: %s\n" "$wtype" - exit 1 - fi - - wtype="$iw" -fi - -# sample repos -repos=( - "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" - "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" - "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" - "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" - "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" - "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" - "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" - "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" - "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" - "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" -) -if [ $is_interactive -eq 1 ]; then - printf "\n" - printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" - printf " Based on the options that follow, the script might download a model file\n" - printf " from the internet, which can be a few GBs in size. 
The script will also\n" - printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" - printf "\n" - printf " Upon success, an HTTP server will be started and it will serve the selected\n" - printf " model using llama.cpp for demonstration purposes.\n" - printf "\n" - printf " Please note:\n" - printf "\n" - printf " - All new data will be stored in the current folder\n" - printf " - The server will be listening on all network interfaces\n" - printf " - The server will run with default settings which are not always optimal\n" - printf " - Do not judge the quality of a model based on the results from this script\n" - printf " - Do not use this script to benchmark llama.cpp\n" - printf " - Do not use this script in production\n" - printf " - This script is only for demonstration purposes\n" - printf "\n" - printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" - printf "\n" - printf " Press Enter to continue ...\n\n" - - read -fi - -if [[ -z "$repo" ]]; then - printf "[+] No repo provided from the command line\n" - printf " Please select a number from the list below or enter an URL:\n\n" - - is=0 - for r in "${repos[@]}"; do - printf " %2d) %s\n" $is "$r" - is=$((is+1)) - done - - # ask for repo until index of sample repo is provided or an URL - while [[ -z "$repo" ]]; do - printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" - read -p "[+] Select repo: " repo - - # check if the input is a number - if [[ "$repo" =~ ^[0-9]+$ ]]; then - if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then - repo="${repos[$repo]}" - else - printf "[-] Invalid repo index: %s\n" "$repo" - repo="" - fi - elif [[ "$repo" =~ ^https?:// ]]; then - repo="$repo" - else - printf "[-] Invalid repo URL: %s\n" "$repo" - repo="" - fi - done -fi - -# remove suffix -repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') - -printf "[+] Checking for GGUF model files in %s\n" "$repo" - -# find GGUF files in the source -# TODO: better logic -model_tree="${repo%/}/tree/main" -model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*(.*)<\/span><\/a>/\1/g') - -# list all files in the provided git repo -printf "[+] Model files:\n\n" -for file in $model_files; do - # determine iw by grepping the filename with wtypes - iw=-1 - is=0 - for wt in "${wtypes[@]}"; do - # uppercase - ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') - if [[ "$ufile" =~ "$wt" ]]; then - iw=$is - break - fi - is=$((is+1)) - done - - if [[ $iw -eq -1 ]]; then - continue - fi - - wfiles[$iw]="$file" - - have=" " - if [[ -f "$file" ]]; then - have="*" - fi - - printf " %2d) %s %s\n" $iw "$have" "$file" -done - -wfile="${wfiles[$wtype]}" - -# ask for weights type until provided and available -while [[ -z "$wfile" ]]; do - printf "\n" - read -p "[+] Select weight type: " wtype - wfile="${wfiles[$wtype]}" - - if [[ -z "$wfile" ]]; then - printf "[-] Invalid weight type: %s\n" "$wtype" - wtype="" - fi -done - -printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" - -url="${repo%/}/resolve/main/$wfile" - -# check file if the model has been downloaded before -chk="$wfile.chk" - -# check if we should download the file -# - if $wfile does not exist -# - if $wfile exists but $chk does not exist -# - if $wfile exists and $chk exists but $wfile is newer than $chk -# TODO: better logic using git lfs info - -do_download=0 - -if [[ ! -f "$wfile" ]]; then - do_download=1 -elif [[ ! 
-f "$chk" ]]; then - do_download=1 -elif [[ "$wfile" -nt "$chk" ]]; then - do_download=1 -fi - -if [[ $do_download -eq 1 ]]; then - printf "[+] Downloading weights from %s\n" "$url" - - # download the weights file - curl -o "$wfile" -# -L "$url" - - # create a check file if successful - if [[ $? -eq 0 ]]; then - printf "[+] Creating check file %s\n" "$chk" - touch "$chk" - fi -else - printf "[+] Using cached weights %s\n" "$wfile" -fi - -# get latest llama.cpp and build - -printf "[+] Downloading latest llama.cpp\n" - -llama_cpp_dir="__llama_cpp_port_${port}__" - -if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then - # if the dir exists and there isn't a file "__ggml_script__" in it, abort - printf "[-] Directory %s already exists\n" "$llama_cpp_dir" - printf "[-] Please remove it and try again\n" - exit 1 -elif [[ -d "$llama_cpp_dir" ]]; then - printf "[+] Directory %s already exists\n" "$llama_cpp_dir" - printf "[+] Using cached llama.cpp\n" - - cd "$llama_cpp_dir" - git reset --hard - git fetch - git checkout origin/master - - cd .. -else - printf "[+] Cloning llama.cpp\n" - - git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" -fi - -# mark that that the directory is made by this script -touch "$llama_cpp_dir/__ggml_script__" - -if [[ $verbose -eq 1 ]]; then - set -x -fi - -# build -cd "$llama_cpp_dir" - -make clean - -log="--silent" -if [[ $verbose -eq 1 ]]; then - log="" -fi - -if [[ "$backend" == "cuda" ]]; then - printf "[+] Building with CUDA backend\n" - GGML_CUDA=1 make -j llama-server $log -elif [[ "$backend" == "cpu" ]]; then - printf "[+] Building with CPU backend\n" - make -j llama-server $log -elif [[ "$backend" == "metal" ]]; then - printf "[+] Building with Metal backend\n" - make -j llama-server $log -else - printf "[-] Unknown backend: %s\n" "$backend" - exit 1 -fi - -# run the server - -printf "[+] Running server\n" - -args="" -if [[ "$backend" == "cuda" ]]; then - export CUDA_VISIBLE_DEVICES=$gpu_id - args="-ngl 999" -elif [[ "$backend" == "cpu" ]]; then - args="-ngl 0" -elif [[ "$backend" == "metal" ]]; then - args="-ngl 999" -else - printf "[-] Unknown backend: %s\n" "$backend" - exit 1 -fi - -if [[ $verbose -eq 1 ]]; then - args="$args --verbose" -fi - -./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args - -exit 0 From 642330ac7cea11dc1f9d3df2b8f3dbd10b5e3f3e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 2 Dec 2024 22:10:19 +0100 Subject: [PATCH 24/24] llama : add enum for built-in chat templates (#10623) * llama : add enum for supported chat templates * use "built-in" instead of "supported" * arg: print list of built-in templates * fix test * update server README --- common/arg.cpp | 20 +- examples/server/README.md | 11 +- include/llama.h | 3 + src/llama.cpp | 363 +++++++++++++++++++++++++---------- tests/test-chat-template.cpp | 20 +- 5 files changed, 310 insertions(+), 107 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 32d9a964c..078c75384 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -348,6 +348,18 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e return true; } +static std::string list_builtin_chat_templates() { + std::vector supported_tmpl; + int32_t res = llama_chat_builtin_templates(nullptr, 0); + supported_tmpl.resize(res); + res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size()); + std::ostringstream msg; + for (auto & tmpl : supported_tmpl) { + msg << tmpl << (&tmpl == 
&supported_tmpl.back() ? "" : ", "); + } + return msg.str(); +} + common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { // load dynamic backends ggml_backend_load_all(); @@ -1814,9 +1826,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + string_format( + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() + ), [](common_params & params, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( diff --git a/examples/server/README.md b/examples/server/README.md index aa99d06f9..3f0d45e5b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -69,6 +69,8 @@ The project is under active development, and we are [looking for feedback and co | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | +| `--list-devices` | print list of available devices and exit | | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | @@ -158,9 +160,16 @@ The project is under active development, and we are [looking for feedback and co | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
list of built-in templates:
chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | +| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. diff --git a/include/llama.h b/include/llama.h index ab5e376e6..439e0ff0c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -990,6 +990,9 @@ extern "C" { char * buf, int32_t length); + // Get list of built-in chat templates + int32_t llama_chat_builtin_templates(const char ** output, size_t len); + // // Sampling API // diff --git a/src/llama.cpp b/src/llama.cpp index 6e9ba9727..6a6f4c2a5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1549,6 +1549,67 @@ static const std::map> LLM_TENSOR_N }, }; +enum llm_chat_template { + LLM_CHAT_TEMPLATE_CHATML, + LLM_CHAT_TEMPLATE_LLAMA_2, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP, + LLM_CHAT_TEMPLATE_MISTRAL_V1, + LLM_CHAT_TEMPLATE_MISTRAL_V3, + LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, + LLM_CHAT_TEMPLATE_MISTRAL_V7, + LLM_CHAT_TEMPLATE_PHI_3, + LLM_CHAT_TEMPLATE_ZEPHYR, + LLM_CHAT_TEMPLATE_MONARCH, + LLM_CHAT_TEMPLATE_GEMMA, + LLM_CHAT_TEMPLATE_ORION, + LLM_CHAT_TEMPLATE_OPENCHAT, + LLM_CHAT_TEMPLATE_VICUNA, + LLM_CHAT_TEMPLATE_VICUNA_ORCA, + LLM_CHAT_TEMPLATE_DEEPSEEK, + LLM_CHAT_TEMPLATE_DEEPSEEK_2, + LLM_CHAT_TEMPLATE_COMMAND_R, + LLM_CHAT_TEMPLATE_LLAMA_3, + LLM_CHAT_TEMPLATE_CHATGML_3, + LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_MINICPM, + LLM_CHAT_TEMPLATE_EXAONE_3, + LLM_CHAT_TEMPLATE_RWKV_WORLD, + LLM_CHAT_TEMPLATE_GRANITE, + LLM_CHAT_TEMPLATE_UNKNOWN, +}; + +static const std::map LLM_CHAT_TEMPLATES = { + { "chatml", LLM_CHAT_TEMPLATE_CHATML }, + { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 }, + { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS }, + { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS }, + { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP }, + { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 }, + { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 }, + { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, + { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, + { "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, + { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR }, + { "monarch", LLM_CHAT_TEMPLATE_MONARCH }, + { "gemma", LLM_CHAT_TEMPLATE_GEMMA }, + { "orion", LLM_CHAT_TEMPLATE_ORION }, + { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT }, + { "vicuna", LLM_CHAT_TEMPLATE_VICUNA }, + { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA }, + { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK }, + { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 }, + { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, + { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, + { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, + { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, + { "granite", LLM_CHAT_TEMPLATE_GRANITE }, +}; + static llm_arch llm_arch_from_string(const std::string & name) { for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT if (kv.second == name) { @@ -21843,18 +21904,109 @@ int32_t llama_detokenize( // chat templates // +static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { + if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) { + return LLM_CHAT_TEMPLATES.at(tmpl); + } + auto 
tmpl_contains = [&tmpl](const char * haystack) -> bool { + return tmpl.find(haystack) != std::string::npos; + }; + if (tmpl_contains("<|im_start|>")) { + return LLM_CHAT_TEMPLATE_CHATML; + } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { + if (tmpl_contains("[SYSTEM_PROMPT]")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V7; + } else if ( + // catches official 'v1' template + tmpl_contains("' [INST] ' + system_message") + // catches official 'v3' and 'v3-tekken' templates + || tmpl_contains("[AVAILABLE_TOOLS]") + ) { + // Official mistral 'v1', 'v3' and 'v3-tekken' templates + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md + if (tmpl_contains(" [INST]")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V1; + } else if (tmpl_contains("\"[INST]\"")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN; + } + return LLM_CHAT_TEMPLATE_MISTRAL_V3; + } else { + // llama2 template and its variants + // [variant] support system message + // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + bool support_system_message = tmpl_contains("<>"); + bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); + bool strip_message = tmpl_contains("content.strip()"); + if (strip_message) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; + } else if (add_bos_inside_history) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; + } else if (support_system_message) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS; + } else { + return LLM_CHAT_TEMPLATE_LLAMA_2; + } + } + } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { + return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { + return LLM_CHAT_TEMPLATE_ZEPHYR; + } else if (tmpl_contains("bos_token + message['role']")) { + return LLM_CHAT_TEMPLATE_MONARCH; + } else if (tmpl_contains("")) { + return LLM_CHAT_TEMPLATE_GEMMA; + } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { + // OrionStarAI/Orion-14B-Chat + return LLM_CHAT_TEMPLATE_ORION; + } else if (tmpl_contains("GPT4 Correct ")) { + // openchat/openchat-3.5-0106 + return LLM_CHAT_TEMPLATE_OPENCHAT; + } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) { + // eachadea/vicuna-13b-1.1 (and Orca variant) + if (tmpl_contains("SYSTEM: ")) { + return LLM_CHAT_TEMPLATE_VICUNA_ORCA; + } + return LLM_CHAT_TEMPLATE_VICUNA; + } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) { + // deepseek-ai/deepseek-coder-33b-instruct + return LLM_CHAT_TEMPLATE_DEEPSEEK; + } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) { + // CohereForAI/c4ai-command-r-plus + return LLM_CHAT_TEMPLATE_COMMAND_R; + } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) { + return LLM_CHAT_TEMPLATE_LLAMA_3; + } else if (tmpl_contains("[gMASK]sop")) { + // chatglm3-6b + return LLM_CHAT_TEMPLATE_CHATGML_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGML_4; + } else if (tmpl_contains(LU8("<用户>"))) { + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + return LLM_CHAT_TEMPLATE_MINICPM; + } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { + return LLM_CHAT_TEMPLATE_DEEPSEEK_2; + } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) { + // ref: 
https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb + // EXAONE-3.0-7.8B-Instruct + return LLM_CHAT_TEMPLATE_EXAONE_3; + } else if (tmpl_contains("rwkv-world")) { + return LLM_CHAT_TEMPLATE_RWKV_WORLD; + } else if (tmpl_contains("<|start_of_role|>")) { + return LLM_CHAT_TEMPLATE_GRANITE; + } + return LLM_CHAT_TEMPLATE_UNKNOWN; +} + // Simple version of "llama_apply_chat_template" that only works with strings // This function uses heuristic checks to determine commonly used template. It is not a jinja parser. static int32_t llama_chat_apply_template_internal( - const std::string & tmpl, + const llm_chat_template tmpl, const std::vector & chat, std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 std::stringstream ss; - auto tmpl_contains = [&tmpl](std::string haystack) -> bool { - return tmpl.find(haystack) != std::string::npos; - }; - if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) { + if (tmpl == LLM_CHAT_TEMPLATE_CHATML) { // chatml template for (auto message : chat) { ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; @@ -21862,86 +22014,84 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { - if (tmpl == "mistral-v7" || tmpl_contains("[SYSTEM_PROMPT]")) { - // Official mistral 'v7' template - // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 - for (auto message : chat) { - std::string role(message->role); - std::string content(message->content); - if (role == "system") { - ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; - } else if (role == "user") { - ss << "[INST] " << content << "[/INST]"; - } - else { - ss << " " << content << ""; - } + } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) { + // Official mistral 'v7' template + // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 + for (auto message : chat) { + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; + } else if (role == "user") { + ss << "[INST] " << content << "[/INST]"; } - } else if (tmpl == "mistral-v1" || tmpl == "mistral-v3" || tmpl == "mistral-v3-tekken" - || tmpl_contains("' [INST] ' + system_message") // catches official 'v1' template - || tmpl_contains("[AVAILABLE_TOOLS]")) { // catches official 'v3' and 'v3-tekken' templates - // Official mistral 'v1', 'v3' and 'v3-tekken' templates - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md - std::string leading_space = (tmpl == "mistral-v1" || tmpl_contains(" [INST]") ? " " : ""); - std::string trailing_space = (tmpl == "mistral-v3-tekken" || tmpl_contains("\"[INST]\"") ? 
"" : " "); - bool trim_assistant_message = tmpl_contains("|trim + eos_token"); - bool is_inside_turn = false; - for (auto message : chat) { - if (!is_inside_turn) { - ss << leading_space << "[INST]" << trailing_space; - is_inside_turn = true; - } - std::string role(message->role); - std::string content(message->content); - if (role == "system") { - ss << content << "\n\n"; - } else if (role == "user") { - ss << content << leading_space << "[/INST]"; - } else { - ss << trailing_space << (trim_assistant_message ? trim(content) : content) << ""; - is_inside_turn = false; - } + else { + ss << " " << content << ""; } - } else { - // llama2 template and its variants - // [variant] support system message - // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 - bool support_system_message = tmpl_contains("<>") || tmpl == "llama2"; - // [variant] space before + after response - bool space_around_response = tmpl_contains("' ' + eos_token"); - // [variant] add BOS inside history - bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); - // [variant] trim spaces from the input message - bool strip_message = tmpl_contains("content.strip()"); - // construct the prompt - bool is_inside_turn = true; // skip BOS at the beginning - ss << "[INST] "; - for (auto message : chat) { - std::string content = strip_message ? trim(message->content) : message->content; - std::string role(message->role); - if (!is_inside_turn) { - is_inside_turn = true; - ss << (add_bos_inside_history ? "[INST] " : "[INST] "); - } - if (role == "system") { - if (support_system_message) { - ss << "<>\n" << content << "\n<>\n\n"; - } else { - // if the model does not support system message, we still include it in the first message, but without <> - ss << content << "\n"; - } - } else if (role == "user") { - ss << content << " [/INST]"; - } else { - ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << ""; - is_inside_turn = false; - } - } - // llama2 templates seem to not care about "add_generation_prompt } - } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 + || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3 + || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) { + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md + std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : ""; + std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " "; + bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3; + bool is_inside_turn = false; + for (auto message : chat) { + if (!is_inside_turn) { + ss << leading_space << "[INST]" << trailing_space; + is_inside_turn = true; + } + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << content << "\n\n"; + } else if (role == "user") { + ss << content << leading_space << "[/INST]"; + } else { + ss << trailing_space << (trim_assistant_message ? 
trim(content) : content) << ""; + is_inside_turn = false; + } + } + } else if ( + tmpl == LLM_CHAT_TEMPLATE_LLAMA_2 + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) { + // llama2 template and its variants + // [variant] support system message + // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2; + // [variant] add BOS inside history + bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; + // [variant] trim spaces from the input message + bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; + // construct the prompt + bool is_inside_turn = true; // skip BOS at the beginning + ss << "[INST] "; + for (auto message : chat) { + std::string content = strip_message ? trim(message->content) : message->content; + std::string role(message->role); + if (!is_inside_turn) { + is_inside_turn = true; + ss << (add_bos_inside_history ? "[INST] " : "[INST] "); + } + if (role == "system") { + if (support_system_message) { + ss << "<>\n" << content << "\n<>\n\n"; + } else { + // if the model does not support system message, we still include it in the first message, but without <> + ss << content << "\n"; + } + } else if (role == "user") { + ss << content << " [/INST]"; + } else { + ss << content << ""; + is_inside_turn = false; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) { // Phi 3 for (auto message : chat) { std::string role(message->role); @@ -21950,7 +22100,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) { // zephyr template for (auto message : chat) { ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; @@ -21958,7 +22108,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) { // mlabonne/AlphaMonarch-7B template (the is included inside history) for (auto message : chat) { std::string bos = (message == chat.front()) ? 
"" : ""; // skip BOS for first message @@ -21967,7 +22117,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { @@ -21989,7 +22139,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } - } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) { // OrionStarAI/Orion-14B-Chat std::string system_prompt = ""; for (auto message : chat) { @@ -22009,7 +22159,7 @@ static int32_t llama_chat_apply_template_internal( ss << message->content << ""; } } - } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) { // openchat/openchat-3.5-0106, for (auto message : chat) { std::string role(message->role); @@ -22023,13 +22173,13 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "GPT4 Correct Assistant:"; } - } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { // eachadea/vicuna-13b-1.1 (and Orca variant) for (auto message : chat) { std::string role(message->role); if (role == "system") { // Orca-Vicuna variant uses a system prefix - if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) { + if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { ss << "SYSTEM: " << message->content << "\n"; } else { ss << message->content << "\n\n"; @@ -22043,7 +22193,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "ASSISTANT:"; } - } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) { // deepseek-ai/deepseek-coder-33b-instruct for (auto message : chat) { std::string role(message->role); @@ -22058,7 +22208,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "### Response:\n"; } - } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) { // CohereForAI/c4ai-command-r-plus for (auto message : chat) { std::string role(message->role); @@ -22073,7 +22223,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; } - } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) { // Llama 3 for (auto message : chat) { std::string role(message->role); @@ -22082,7 +22232,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -22092,7 +22242,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -22101,7 +22251,7 @@ static int32_t 
llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { std::string role(message->role); @@ -22113,7 +22263,7 @@ static int32_t llama_chat_apply_template_internal( ss << trim(message->content); } } - } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) { // DeepSeek-V2 for (auto message : chat) { std::string role(message->role); @@ -22128,7 +22278,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "Assistant:"; } - } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) { + } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) { // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb // EXAONE-3.0-7.8B-Instruct for (auto message : chat) { @@ -22144,7 +22294,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "[|assistant|]"; } - } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { // this template requires the model to have "\n\n" as EOT token for (auto message : chat) { std::string role(message->role); @@ -22154,7 +22304,7 @@ static int32_t llama_chat_apply_template_internal( ss << message->content << "\n\n"; } } - } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) { + } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) { // IBM Granite template for (const auto & message : chat) { std::string role(message->role); @@ -22206,7 +22356,11 @@ int32_t llama_chat_apply_template( } std::string formatted_chat; - int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); + llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl); + if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { + return -1; + } + int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass); if (res < 0) { return res; } @@ -22216,6 +22370,15 @@ int32_t llama_chat_apply_template( return res; } +int32_t llama_chat_builtin_templates(const char ** output, size_t len) { + auto it = LLM_CHAT_TEMPLATES.begin(); + for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) { + output[i] = it->first.c_str(); + std::advance(it, 1); + } + return (int32_t) LLM_CHAT_TEMPLATES.size(); +} + // // sampling // diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index dd8f7d5f0..aa140b569 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -82,9 +82,9 @@ int main(void) { // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt) "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", // TheBloke/FusionNet_34Bx2_MoE-AWQ - "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", // bofenghuang/vigogne-2-70b-chat - "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant 
[INST] Another question [/INST]", + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST]Hi there[INST] Who are you [/INST]I am an assistant[INST] Another question [/INST]", // mlabonne/AlphaMonarch-7B "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", // google/gemma-7b-it @@ -133,6 +133,17 @@ int main(void) { std::vector formatted_chat(1024); int32_t res; + // list all supported templates + std::vector supported_tmpl; + res = llama_chat_builtin_templates(nullptr, 0); + assert(res > 0); + supported_tmpl.resize(res); + res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size()); + printf("Built-in chat templates:\n"); + for (auto tmpl : supported_tmpl) { + printf(" %s\n", tmpl); + } + // test invalid chat template res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); assert(res < 0); @@ -174,7 +185,8 @@ int main(void) { assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n"); assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n"); assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]"); - assert(fmt_sys("llama2") == "[INST] <>\nYou are a helpful assistant\n<>\n\n"); + assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n"); + assert(fmt_sys("llama2-sys") == "[INST] <>\nYou are a helpful assistant\n<>\n\n"); assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>"); @@ -203,5 +215,7 @@ int main(void) { assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + printf("Test chat templates: OK\n"); + return 0; }