Merge branch 'ggerganov:master' into master

Author: Vinesh Janarthanan
Date: 2024-09-12 22:14:53 -05:00 (committed by GitHub)
Commit: 69c97bbead
50 changed files with 712 additions and 343 deletions

View File

@ -375,7 +375,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -401,7 +401,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@ -442,7 +442,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@ -546,7 +546,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -576,7 +576,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -610,7 +610,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -969,14 +969,14 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"

View File

@ -173,6 +173,7 @@ jobs:
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests

View File

@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
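# The guards above keep failed lookups out of GGML_TRANSIENT_DEFINES: get_target_property()
# returns GGML_TARGET_DEFINES-NOTFOUND when the property is unset, and appending that
# placeholder would surface as a bogus definition in the generated llama-config package.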
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)

View File

@ -434,7 +434,7 @@ endif
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifndef RISCV
ifndef RISCV_CROSS_COMPILE
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available:
@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
MK_CXXFLAGS += -mlasx
endif
else
ifneq ($(filter riscv64%,$(UNAME_M)),)
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
else # RISC-V CROSS COMPILATION
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
@ -1454,7 +1459,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
./llama-gen-docs
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \

View File

@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

View File

@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
std::string arg;
const std::string arg_prefix = "--";
gpt_params & params = ctx_arg.params;
gpt_sampler_params & sparams = params.sparams;
std::unordered_map<std::string, llama_arg *> arg_to_options;
for (auto & opt : ctx_arg.options) {
@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
params.kv_overrides.back().key[0] = 0;
}
if (sparams.seed == LLAMA_DEFAULT_SEED) {
sparams.seed = time(NULL);
}
return true;
}
@ -823,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
[](gpt_params & params) {
params.special = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"-cnv", "--conversation"},
format(
@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
).set_sparam());
add_opt(llama_arg(
{"-s", "--seed"}, "SEED",
format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
[](gpt_params & params, const std::string & value) {
params.sparams.seed = std::stoul(value);
}
@ -1422,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
}
else if (arg_next == "row") {
} else if (arg_next == "row") {
#ifdef GGML_USE_SYCL
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
exit(1);
#endif // GGML_USE_SYCL
params.split_mode = LLAMA_SPLIT_MODE_ROW;
}
else {
} else {
throw std::invalid_argument("invalid value");
}
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
}
}
));
add_opt(llama_arg(
@ -1460,9 +1453,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
}
}
));
add_opt(llama_arg(
@ -1470,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
[](gpt_params & params, int value) {
params.main_gpu = value;
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
}
}
));
add_opt(llama_arg(

View File

@ -56,14 +56,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
#define GGML_USE_CUDA_SYCL
#endif
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUDA_SYCL_VULKAN
#endif
#if defined(LLAMA_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
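// With the defaults above (CURL_MAX_RETRY = 3, CURL_RETRY_DELAY_SECONDS = 2), the
// backoff works out to 2^0*1000 = 1000 ms, 2^1*1000 = 2000 ms and 2^2*1000 = 4000 ms
// after the first, second and third failed attempt, respectively.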
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
};
// start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
@ -1812,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");

View File

@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return cur_p.data[cur_p.selected].id;
}
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain);
}
// helpers
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {

View File

@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens

View File

@ -302,6 +302,8 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
)
)
or not new_name.endswith(".weight")
@ -624,6 +626,9 @@ class Model:
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
# ref: https://huggingface.co/microsoft/phi-2
res = "phi-2"
if res is None:
logger.warning("\n")
@ -2769,6 +2774,8 @@ class Rwkv6Model(Model):
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]

View File

@ -31,6 +31,7 @@ import re
import requests
import sys
import json
import shutil
from hashlib import sha256
from enum import IntEnum, auto
@ -97,6 +98,7 @@ models = [
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
]
@ -125,6 +127,21 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
if os.path.isdir(repo):
# If repo is a path on the file system, copy the directory
for file in files:
src_path = os.path.join(repo, file)
dst_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):

View File

@ -363,7 +363,13 @@ if __name__ == '__main__':
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()

View File

@ -3,32 +3,10 @@
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);

View File

@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
TENSOR_DUMP(gf->nodes[0]);
TENSOR_DUMP(ggml_graph_node(gf, 0));
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

View File

@ -226,8 +226,8 @@ static ggml_status compute_piter(
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < gf->n_nodes; ++i) {
auto node = gf->nodes[i];
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
auto node = ggml_graph_node(gf, i);
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {

View File

@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -370,7 +370,7 @@ struct lora_merge_ctx {
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
auto * result = ggml_graph_node(gf, -1);
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);

View File

@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = gpt_sampler_init(model, sparams);
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
@ -340,8 +341,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
smpl = gpt_sampler_init(model, sparams);
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {

View File

@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
```
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
```sh
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.

View File

@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

View File

@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor* result = ggml_graph_node(gf, -1);
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):

View File

@ -18,8 +18,8 @@ struct llava_context {
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}

View File

@ -191,8 +191,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -470,8 +468,10 @@ int main(int argc, char ** argv) {
exit(1);
}
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state

View File

@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -1,6 +1,6 @@
set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
*Options:*
`content`: Set the text to tokenize.
`content`: (Required) The text to tokenize.
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
**Response:**
Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
If `with_pieces` is `false`:
```json
{
"tokens": [123, 456, 789]
}
```
If `with_pieces` is `true`:
```json
{
"tokens": [
{"id": 123, "piece": "Hello"},
{"id": 456, "piece": " world"},
{"id": 789, "piece": "!"}
]
}
```
With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
```json
{
"tokens": [
{"id": 198, "piece": [195]}, // hex C3
{"id": 164, "piece": [161]} // hex A1
]
}
```
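For a quick end-to-end check of `with_pieces`, a minimal client sketch is shown below. It is not part of this change; it assumes a `llama-server` listening on the default `http://localhost:8080` and reuses the cpp-httplib and nlohmann::json headers the server example is built with:

```cpp
// Hypothetical client: POST /tokenize with "with_pieces" and print each token.
#include "httplib.h"   // cpp-httplib (vendored by the server example)
#include "json.hpp"    // nlohmann::json (vendored by the server example)
#include <cstdio>

int main() {
    httplib::Client cli("http://localhost:8080");
    const nlohmann::json body = { {"content", "Hello world"}, {"with_pieces", true} };

    auto res = cli.Post("/tokenize", body.dump(), "application/json");
    if (!res || res->status != 200) {
        std::fprintf(stderr, "tokenize request failed\n");
        return 1;
    }

    for (const auto & tok : nlohmann::json::parse(res->body)["tokens"]) {
        // with_pieces=true: each entry is an object; "piece" is a string for valid
        // UTF-8 and an array of byte values otherwise (see the 'á' example above)
        std::printf("%d -> %s\n", tok["id"].get<int>(), tok["piece"].dump().c_str());
    }
    return 0;
}
```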
### POST `/detokenize`: Convert tokens to text

View File

@ -1266,6 +1266,7 @@ struct server_context {
{"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias},
{"seed", slot.sparams.seed},
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@ -3017,12 +3018,39 @@ int main(int argc, char ** argv) {
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
json tokens_response = json::array();
if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false);
tokens = ctx_server.tokenize(body.at("content"), add_special);
const bool with_pieces = json_value(body, "with_pieces", false);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
if (with_pieces) {
for (const auto& token : tokens) {
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
json piece_json;
// Check if the piece is valid UTF-8
if (is_valid_utf8(piece)) {
piece_json = piece;
} else {
// If not valid UTF-8, store as array of byte values
piece_json = json::array();
for (unsigned char c : piece) {
piece_json.push_back(static_cast<int>(c));
}
const json data = format_tokenizer_response(tokens);
}
tokens_response.push_back({
{"id", token},
{"piece", piece_json}
});
}
} else {
tokens_response = tokens;
}
}
const json data = format_tokenizer_response(tokens_response);
res_ok(res, data);
};

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import asyncio
import json
import os
@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
context.tokenize_add_special = True
@step("tokenizing with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session:
tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
if getattr(context, "tokenize_add_special", None) is not None:
tokenize_args["add_special"] = context.tokenize_add_special
async with session.post(
f"{context.base_url}/tokenize", json=tokenize_args
) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens_with_pieces = tokenize_json["tokens"]
@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
# Verify that the response contains both token IDs and pieces
assert all(
"id" in token and "piece" in token for token in context.tokens_with_pieces
)
@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):

View File

@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
return res;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
static bool is_valid_utf8(const std::string & str) {
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
const unsigned char* end = bytes + str.length();
while (bytes < end) {
if (*bytes <= 0x7F) {
// 1-byte sequence (0xxxxxxx)
bytes++;
} else if ((*bytes & 0xE0) == 0xC0) {
// 2-byte sequence (110xxxxx 10xxxxxx)
if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
return false;
bytes += 2;
} else if ((*bytes & 0xF0) == 0xE0) {
// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
return false;
bytes += 3;
} else if ((*bytes & 0xF8) == 0xF0) {
// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
return false;
bytes += 4;
} else {
// Invalid UTF-8 lead byte
return false;
}
}
return true;
}
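// For example, the two-byte sequence C3 A1 ("á") is accepted, while a lone C3 or A1 byte is
// rejected as an invalid or truncated sequence; this is why the 'á' example in the server
// README comes back as byte arrays rather than strings.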
static json format_tokenizer_response(const json & tokens) {
return json {
{"tokens", tokens}
};

View File

@ -4,33 +4,23 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
GGML_SYCL_SINGLE_GPU=1
else
GGML_SYCL_DEVICE=0
GGML_SYCL_SINGLE_GPU=0
fi
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=llama-2-7b.Q4_0.gguf
NGL=33
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
fi
#use main GPU only
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

View File

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1725024810,
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
"lastModified": 1725234343,
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1724819573,
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
"lastModified": 1725634671,
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
"type": "github"
},
"original": {
@ -36,14 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1722555339,
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
"lastModified": 1725233747,
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
}
},
"root": {

View File

@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
*/
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
*

View File

@ -358,6 +358,7 @@ extern "C" {
struct ggml_object;
struct ggml_context;
struct ggml_cgraph;
// NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type {
@ -575,20 +576,6 @@ extern "C" {
GGML_TENSOR_FLAG_PARAM = 4,
};
// ggml object
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -671,35 +658,6 @@ extern "C" {
void * abort_callback_data;
};
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
typedef uint32_t ggml_bitset_t;
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
// computation graph
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
@ -2017,8 +1975,6 @@ extern "C" {
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
#define GGML_N_TASKS_MAX -1
GGML_API struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -2088,7 +2044,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
@ -2096,11 +2051,17 @@ extern "C" {
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
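Since `struct ggml_cgraph` is no longer part of the public header (its definition moves to `ggml-impl.h` further down), code that used to read `gf->nodes` and `gf->n_nodes` directly is expected to go through the accessors above. A minimal sketch of the replacement pattern, not taken from this patch:

```cpp
// Sketch: inspect a finished compute graph through the new accessor API instead of
// touching the now-private struct ggml_cgraph fields. `gf` is assumed to come from
// the usual ggml_new_graph()/ggml_build_forward_expand() flow (omitted here).
#include "ggml.h"
#include <cstdio>

static void dump_graph(struct ggml_cgraph * gf) {
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        std::printf("node %3d: op=%s name=%s\n", i, ggml_op_name(node->op), node->name);
    }
    // negative indices count from the end, so -1 is the typical "result" tensor
    struct ggml_tensor * result = ggml_graph_node(gf, -1);
    std::printf("output: %s (%zu bytes)\n", result->name, ggml_nbytes(result));
}
```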
@ -2509,6 +2470,7 @@ extern "C" {
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);

View File

@ -1,3 +1,4 @@
#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"

View File

@ -30,6 +30,7 @@
#include <cstring>
#include <mutex>
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cann/aclnn_ops.h"
#include "ggml-cann/common.h"
@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
return &ggml_backend_cann_buffer_types[device];
}
/**
* @brief Retrieves the name associated with a CANN host buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer type context.
*
* @param buft Pointer to the host buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return "CANN_Host";
GGML_UNUSED(buft);
}
/**
* @brief Retrieves the name associated with a CANN host buffer.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer context.
*
* @param buffer Pointer to the host buffer.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
return "CANN_Host";
GGML_UNUSED(buffer);
}
/**
* @brief Free resources associated with a CANN host buffer.
*
* This function frees the resources associated with a CANN host buffer, including
* its context.
*
* @param buffer The CANN host buffer to free.
*/
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
}
/**
* @brief Allocates a new CANN host buffer of the specified size.
*
* This function allocates a new CANN host buffer with the given size.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
*/
static void * ggml_cann_host_malloc(size_t size) {
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
return nullptr;
}
void * hostPtr = nullptr;
aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) {
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
return nullptr;
}
return hostPtr;
}
/**
* @brief Allocates a new CANN host buffer of the specified type and size.
*
* @param buft Pointer to the host buffer type context.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
*/
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * hostPtr = ggml_cann_host_malloc(size);
if (hostPtr == nullptr) {
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
}
/**
* @brief Interface for managing CANN host buffer types in the GGML backend.
*
* Provides function pointers for allocating, querying properties, and managing
* memory for CANN buffer types in the GGML backend.
*/
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_cann_buffer_type_host;
}
/**
* @brief Computes the forward operation for a given tensor using CANN
* operations.
@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr;
}
ggml_cann_set_device(ctx->device);
ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface,

View File

@ -1,5 +1,5 @@
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cuda/common.cuh"

View File

@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
case GGML_TYPE_Q4_0:

View File

@ -2742,6 +2742,7 @@ struct mmq_args {
int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0;
bool use_stream_k;
};
template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1);
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
if (!use_stream_k) {
if (!args.use_stream_k) {
if (args.ne01 % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View File

@ -130,42 +130,3 @@
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}

View File

@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
// bitset
typedef uint32_t ggml_bitset_t;
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
GGML_ABORT("fatal error");
}
// computation graph
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
#ifdef __cplusplus
}
#endif

View File

@ -1,4 +1,4 @@
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-kompute.h"

View File

@ -1,7 +1,7 @@
#import "ggml-metal.h"
#import "ggml-impl.h"
#import "ggml-backend-impl.h"
#import "ggml.h"
#import <Foundation/Foundation.h>
@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
NSString * error_code = [command_buffer error].localizedDescription;
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;

View File

@ -1,5 +1,5 @@
#include "ggml-rpc.h"
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include <cinttypes>

View File

@ -33,7 +33,7 @@
#include <sycl/half_type.hpp>
#include "ggml-sycl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-sycl/backend.hpp"
@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:

View File

@ -21,7 +21,7 @@
#include <memory>
#include <mutex>
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-vulkan-shaders.hpp"

View File

@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
#define GGML_N_TASKS_MAX (-1)
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2
@ -1124,17 +1125,17 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
res = GGML_F32x4_REDUCE_ONE(x[0]); \
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
}
#define GGML_F32_VEC GGML_F32x4
@ -1165,26 +1166,26 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
//
// ggml object
//
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
//
// ggml context
//
@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
ggml_hash_set_reset(&cgraph->visited_hash_set);
}
int ggml_graph_size(struct ggml_cgraph * cgraph) {
return cgraph->size;
}
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
if (i < 0) {
GGML_ASSERT(cgraph->n_nodes + i >= 0);
return cgraph->nodes[cgraph->n_nodes + i];
}
GGML_ASSERT(i < cgraph->n_nodes);
return cgraph->nodes[i];
}
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
return cgraph->nodes;
}
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
return cgraph->n_nodes;
}
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
cgraph->nodes[cgraph->n_nodes] = tensor;
cgraph->n_nodes++;
}
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
@ -23242,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) {
#endif
}
int ggml_cpu_has_riscv_v(void) {
#if defined(__riscv_v_intrinsic)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_metal(void) {
#if defined(GGML_USE_METAL)
return 1;

View File

@ -1056,6 +1056,9 @@ extern "C" {
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
@ -1127,6 +1130,10 @@ extern "C" {
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
/// @details Sample and accept a token from the idx-th output of the last evaluation
//
// Shorthand for:

View File

@ -8,6 +8,7 @@
#include <cstring>
#include <ctime>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <numeric>
#include <random>
@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
cur_p->size = k;
}
static uint32_t get_rng_seed(uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
// use system clock if std::random_device is not a true RNG
static bool is_rd_prng = std::random_device().entropy() == 0;
if (is_rd_prng) {
return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
}
std::random_device rd;
return rd();
}
return seed;
}
// llama_sampler API
const char * llama_sampler_name(const struct llama_sampler * smpl) {
@ -335,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
const auto * p = (const llama_sampler_chain *) chain->ctx;
if (i < 0 || i >= (int32_t) p->samplers.size()) {
if (i < 0 || (size_t) i >= p->samplers.size()) {
return nullptr;
}
return p->samplers[i];
}
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
auto * p = (llama_sampler_chain *) chain->ctx;
if (i < 0 || (size_t) i >= p->samplers.size()) {
return nullptr;
}
auto * result = p->samplers[i];
p->samplers.erase(p->samplers.begin() + i);
return result;
}
int llama_sampler_chain_n(const struct llama_sampler * chain) {
const auto * p = (const llama_sampler_chain *) chain->ctx;
@ -387,6 +414,7 @@ struct llama_sampler * llama_sampler_init_greedy() {
struct llama_sampler_dist {
const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng;
};
@ -416,7 +444,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@ -433,11 +462,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
};
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed,
/* .rng = */ std::mt19937(seed),
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1032,6 +1063,7 @@ struct llama_sampler_mirostat {
const int32_t n_vocab;
const uint32_t seed;
uint32_t seed_cur;
const float tau;
const float eta;
@ -1100,7 +1132,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@ -1117,16 +1150,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
};
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .tau = */ tau,
/* .eta = */ eta,
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed),
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1135,6 +1170,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
struct llama_sampler_mirostat_v2 {
const uint32_t seed;
uint32_t seed_cur;
const float tau;
const float eta;
@ -1179,7 +1215,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@ -1212,14 +1249,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
};
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .tau = */ tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed),
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1505,6 +1544,8 @@ struct llama_sampler * llama_sampler_init_penalties(
ignore_eos = false;
}
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
@ -1568,6 +1609,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
}
}
}
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@ -1599,3 +1641,31 @@ struct llama_sampler * llama_sampler_init_logit_bias(
},
};
}
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
if (smpl->iface == &llama_sampler_dist_i) {
return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_i) {
return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_v2_i) {
return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_chain_i) {
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
const uint32_t seed = llama_sampler_get_seed(*it);
if (seed != LLAMA_DEFAULT_SEED) {
return seed;
}
}
}
return LLAMA_DEFAULT_SEED;
}
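
llama_sampler_get_seed resolves the effective seed: the dist and mirostat samplers report their seed_cur, and a chain is walked back to front until a non-default seed is found. A short caller-side sketch, assuming the function is exported through llama.h alongside the samplers above:

#include "llama.h"
#include <cstdio>

// Sketch: report the seed that will actually drive sampling for this chain,
// e.g. so a run can be reproduced by passing it back to llama_sampler_init_dist.
static void log_sampling_seed(void) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // resolved internally via seed_cur

    printf("sampling seed: %u\n", llama_sampler_get_seed(chain));
    llama_sampler_free(chain);
}
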


@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CANN)
if (host_buffer) {
buft = ggml_backend_cann_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
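
The new GGML_USE_CANN branch follows the same pattern as the other backends: when a pinned host buffer is requested, the backend-specific host buffer type is preferred over the plain CPU one. A reduced sketch of that selection, keeping only the branch added here (the ggml-cann.h header name is an assumption):

#include "ggml-backend.h"
#if defined(GGML_USE_CANN)
#include "ggml-cann.h"
#endif

// Sketch of the selection pattern above, reduced to the CANN branch added in
// this hunk; the real function also covers CUDA, SYCL, Vulkan, CPU_HBM, etc.
static ggml_backend_buffer_type_t default_cpu_buffer_type_sketch(bool host_buffer) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
#if defined(GGML_USE_CANN)
    if (host_buffer) {
        buft = ggml_backend_cann_host_buffer_type();
    }
#else
    (void) host_buffer;
#endif
    return buft;
}
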
@ -9258,7 +9262,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
// FIXME: zero-out NANs?
states = ggml_mul(ctx, states, state_mask);
// copy states which won't be changed further (between n_seqs and n_rs)
// copy states which won't be changed further (between n_seqs and n_kv)
ggml_build_forward_expand(graph,
ggml_cpy(ctx,
ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
@ -9877,8 +9881,8 @@ struct llm_build_context {
struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
// find result_norm tensor for input
struct ggml_tensor * inp = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) {
inp = gf->nodes[i];
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
inp = ggml_graph_node(gf, i);
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
break;
} else {
@ -16076,19 +16080,21 @@ static int llama_decode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
if (batch_all.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
}
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
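
The token validation loop now runs only when batch_all.token is set, since the GGML_ASSERT above guarantees a batch carries either token ids or embeddings, never both. The same guard as a standalone helper, a sketch assuming the field types used in this hunk:

#include "llama.h" // for llama_token
#include <cstdint>

// Sketch: -1 on an out-of-range token id, 0 otherwise; a null token pointer
// means an embedding-only batch and is accepted as-is.
static int check_token_ids(const llama_token * tokens, uint32_t n_tokens, uint32_t n_vocab) {
    if (tokens == nullptr) {
        return 0;
    }
    for (uint32_t i = 0; i < n_tokens; ++i) {
        if (tokens[i] < 0 || (uint32_t) tokens[i] >= n_vocab) {
            return -1;
        }
    }
    return 0;
}
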
@ -16205,8 +16211,8 @@ static int llama_decode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
// the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
struct ggml_tensor * res = ggml_graph_node(gf, -1);
struct ggml_tensor * embd = ggml_graph_node(gf, -2);
if (lctx.n_outputs == 0) {
// no output
@ -16215,9 +16221,9 @@ static int llama_decode_internal(
} else if (cparams.embeddings) {
res = nullptr; // do not extract logits for embedding case
embd = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) {
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
embd = gf->nodes[i];
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
embd = ggml_graph_node(gf, i);
break;
}
}
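
This is one of several hunks in the commit that replace direct gf->nodes / gf->n_nodes access with the ggml_graph_n_nodes() and ggml_graph_node() accessors; the -1/-2 indices used a few lines up suggest ggml_graph_node also accepts negative offsets counted from the end. A small sketch of the same name lookup written against those accessors, under that assumption:

#include "ggml.h"
#include <cstring>

// Sketch: locate a graph output by name via the accessor API instead of
// reaching into ggml_cgraph internals.
static struct ggml_tensor * find_node_by_name(struct ggml_cgraph * gf, const char * name) {
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        if (strcmp(node->name, name) == 0) {
            return node;
        }
    }
    return nullptr;
}
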
@ -16375,19 +16381,21 @@ static int llama_encode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
}
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
@ -16432,15 +16440,15 @@ static int llama_encode_internal(
// there are two cases here
if (llama_model_has_decoder(&lctx.model)) {
// first case is an encoder-decoder T5 model where embeddings are passed to decoder
embd = gf->nodes[gf->n_nodes - 1];
embd = ggml_graph_node(gf, -1);
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
} else {
// second case is an encoder-only T5 model
if (cparams.embeddings) {
// only output embeddings if required
embd = gf->nodes[gf->n_nodes - 1];
embd = ggml_graph_node(gf, -1);
if (strcmp(embd->name, "result_embd_pooled") != 0) {
embd = gf->nodes[gf->n_nodes - 2];
embd = ggml_graph_node(gf, -2);
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
}
@ -17530,6 +17538,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@ -18486,7 +18496,7 @@ struct llama_context * llama_new_context_with_model(
// note: the number of splits during measure is higher than during inference due to the kv shift
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
}
}
@ -20666,6 +20676,7 @@ const char * llama_print_system_info(void) {
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";


@ -519,7 +519,7 @@ struct test_case {
// add sentinels as graph nodes so that they are checked in the callback
for (ggml_tensor * sentinel : sentinels) {
gf->nodes[gf->n_nodes++] = sentinel;
ggml_graph_add_node(gf, sentinel);
}
// randomize tensors
@ -679,9 +679,9 @@ struct test_case {
// duplicate the op
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
for (int i = 1; i < n_runs; i++) {
gf->nodes[gf->n_nodes++] = out;
ggml_graph_add_node(gf, out);
}
// calculate memory
@ -696,11 +696,11 @@ struct test_case {
}
return size;
};
for (int i = 0; i < gf->n_nodes; i++) {
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
continue;
}
mem += tensor_op_size(gf->nodes[i]);
mem += tensor_op_size(ggml_graph_node(gf, i));
}
// run
@ -804,7 +804,7 @@ struct test_case {
ggml_graph_cpy(gf, gb);
ggml_build_backward_expand(ctx, gf, gb, false);
if (expect.size() != 1 || expect[0] != 0.0f) {
GGML_ASSERT(gb->n_nodes > gf->n_nodes);
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
}