Merge remote-tracking branch 'origin' into add-support-for-phi3-vision

2024-12-25 02:44:36 +00:00 · 2024-08-27 18:13:54 -04:00 · 2024-08-27 18:13:54 -04:00 · 951f1d9053
commit 951f1d9053
parent dc0625ab8f 20f1789dfb
57 changed files with 9189 additions and 7109 deletions
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 RUN make -j$(nproc) llama-server
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 # Enable cURL
 ENV LLAMA_CURL=1
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
--- a/.ecrc
+++ b/.ecrc
@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$"],
+  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
  "Disable": {
    "IndentSize": true
  }
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -28,6 +28,7 @@
    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
    {
        "name": "arm64-windows-msvc", "hidden": true,
@ -60,6 +61,8 @@
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
  ]
 }
--- a/ci/run.sh
+++ b/ci/run.sh
@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
 fi
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
 ## helpers
 # download a file if it does not exist or if it is outdated
@ -107,7 +114,7 @@ function gg_run_ctest_debug {
    gg_check_build_requirements
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log
    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -138,7 +145,7 @@ function gg_run_ctest_release {
    gg_check_build_requirements
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
 }
 # open_llama_7b_v2
 # requires: GG_BUILD_CUDA
 function gg_run_open_llama_7b_v2 {
    cd ${SRC}
@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
 }
 # pythia_2_8b
 # requires: GG_BUILD_CUDA
 function gg_run_pythia_2_8b {
    cd ${SRC}
@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    fi
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ]; then
+        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
            test $ret -eq 0 && gg_run pythia_1_4b
        else
            test $ret -eq 0 && gg_run pythia_2_8b
--- a/common/common.cpp
+++ b/common/common.cpp
@ -327,6 +327,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_params_parse_from_env(gpt_params & params) {
    // we only care about server-related params for now
    get_env("LLAMA_ARG_MODEL",            params.model);
    get_env("LLAMA_ARG_MODEL_URL",        params.model_url);
    get_env("LLAMA_ARG_MODEL_ALIAS",      params.model_alias);
    get_env("LLAMA_ARG_HF_REPO",          params.hf_repo);
    get_env("LLAMA_ARG_HF_FILE",          params.hf_file);
    get_env("LLAMA_ARG_THREADS",          params.n_threads);
    get_env("LLAMA_ARG_CTX_SIZE",         params.n_ctx);
    get_env("LLAMA_ARG_N_PARALLEL",       params.n_parallel);
@ -341,6 +345,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
    get_env("LLAMA_ARG_EMBEDDINGS",       params.embedding);
    get_env("LLAMA_ARG_FLASH_ATTN",       params.flash_attn);
    get_env("LLAMA_ARG_DEFRAG_THOLD",     params.defrag_thold);
    get_env("LLAMA_ARG_CONT_BATCHING",    params.cont_batching);
    get_env("LLAMA_ARG_HOST",             params.hostname);
    get_env("LLAMA_ARG_PORT",             params.port);
 }
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@ -901,7 +908,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
        CHECK_ARG
        params.n_gpu_layers_draft = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
@ -1861,13 +1868,19 @@ std::string string_get_sortable_timestamp() {
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
+    size_t last_pos = 0;
-        s.replace(pos, search.length(), replace);
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        pos += replace.length();
+        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
 }
 void string_process_escapes(std::string & input) {
--- a/common/stb_image.h
+++ b/common/stb_image.h
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -63,6 +63,7 @@ class Model:
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
    is_lora: bool
    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@ -70,7 +71,7 @@ class Model:
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@ -92,6 +93,7 @@ class Model:
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py
        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
        if self.ftype == gguf.LlamaFileType.GUESSED:
@ -1570,7 +1572,7 @@ class LlamaModel(Model):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
                factor = rope_scaling.get("factor", 8.0)
@ -1593,7 +1595,8 @@ class LlamaModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
        super().prepare_tensors()
@ -2140,8 +2143,9 @@ class Phi3MiniModel(Model):
        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
+        if not self.is_lora:
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
@Model.register("PlamoForCausalLM")
@ -3816,7 +3820,7 @@ class ExaoneModel(Model):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
                factor = rope_scaling.get("factor", 8.0)
@ -3839,7 +3843,8 @@ class ExaoneModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
        super().prepare_tensors()
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@ -386,6 +386,7 @@ if __name__ == '__main__':
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            is_lora=True,
        )
        logger.info("Exporting model...")
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@ -28,10 +28,6 @@
 The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
 When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
 It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
 ## Recommended Release
 The SYCL backend would be broken by some PRs due to no online CI.
@ -45,6 +41,10 @@ The following release is verified with good quality:
 ## News
 - 2024.8
  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
 - 2024.5
  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
  - Arch Linux is verified successfully.
@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
 Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
 - **Adding support to Nvidia GPUs**
@ -255,8 +255,6 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 # Option 1: Use FP32 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@ -15,8 +15,8 @@ cd llama.cpp
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
 ```bash
-python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
 # quantize int4 version
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -218,13 +218,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
+    size_t last_pos = 0;
-        s.replace(pos, search.length(), replace);
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        pos += replace.length();
+        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
 }
 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -104,7 +104,7 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --keep-split: will generate quatized model in the same shards as input");
+    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -249,23 +249,49 @@ logging:
 Available environment variables (if specified, these variables will override parameters specified in arguments):
- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
- `LLAMA_ARG_MODEL`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
- `LLAMA_ARG_THREADS`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
- `LLAMA_ARG_CTX_SIZE`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
- `LLAMA_ARG_N_PARALLEL`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
- `LLAMA_ARG_BATCH`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
- `LLAMA_ARG_UBATCH`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
- `LLAMA_ARG_N_GPU_LAYERS`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
- `LLAMA_ARG_THREADS_HTTP`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
- `LLAMA_ARG_CHAT_TEMPLATE`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
- `LLAMA_ARG_N_PREDICT`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
- `LLAMA_ARG_ENDPOINT_METRICS`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
- `LLAMA_ARG_ENDPOINT_SLOTS`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
- `LLAMA_ARG_EMBEDDINGS`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
- `LLAMA_ARG_FLASH_ATTN`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
- `LLAMA_ARG_DEFRAG_THOLD`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
 - `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
 - `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
 - `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
 - `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
 - `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
 - `LLAMA_ARG_HOST`: equivalent to `--host`
 - `LLAMA_ARG_PORT`: equivalent to `--port`
 Example usage of docker compose with environment variables:
 ```yml
 services:
  llamacpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    ports:
      - 8080:8080
    volumes:
      - ./models:/models
    environment:
      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
      LLAMA_ARG_MODEL: /models/my_model.gguf
      LLAMA_ARG_CTX_SIZE: 4096
      LLAMA_ARG_N_PARALLEL: 2
      LLAMA_ARG_ENDPOINT_METRICS: 1  # to disable, either remove or set to 0
      LLAMA_ARG_PORT: 8080
 ```
 ## Build
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -63,6 +63,7 @@ extern "C" {
    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    // "offset" refers to the offset of the tensor data for setting/getting data
    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -220,7 +220,7 @@
 #include <stdio.h>
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2
 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -453,6 +453,8 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
        GGML_OP_SIN,
        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
@ -490,9 +492,11 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@ -969,6 +973,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // return scalar
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
@ -1566,34 +1586,49 @@ extern "C" {
            float                 min,
            float                 max);
    // im2col
    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
    GGML_API struct ggml_tensor * ggml_im2col(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,  // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,  // data
-            int                  s0,
+            int                   s0, // stride dimension 0
-            int                  s1,
+            int                   s1, // stride dimension 1
-            int                  p0,
+            int                   p0, // padding dimension 0
-            int                  p1,
+            int                   p1, // padding dimension 1
-            int                  d0,
+            int                   d0, // dilation dimension 0
-            int                  d1,
+            int                   d1, // dilation dimension 1
-            bool                 is_2D,
+            bool                  is_2D,
-            enum ggml_type       dst_type);
+            enum ggml_type        dst_type);
    GGML_API struct ggml_tensor * ggml_im2col_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,  // convolution kernel
        struct ggml_tensor  * b,  // gradient of im2col output
        int64_t             * ne, // shape of im2col input
        int                   s0, // stride dimension 0
        int                   s1, // stride dimension 1
        int                   p0, // padding dimension 0
        int                   p1, // padding dimension 1
        int                   d0, // dilation dimension 0
        int                   d1, // dilation dimension 1
        bool                  is_2D);
    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,  // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,  // data
-            int                  s0,
+            int                  s0,  // stride dimension 0
-            int                  s1,
+            int                  s1,  // stride dimension 1
-            int                  p0,
+            int                  p0,  // padding dimension 0
-            int                  p1,
+            int                  p1,  // padding dimension 1
-            int                  d0,
+            int                  d0,  // dilation dimension 0
-            int                  d1);
+            int                  d1); // dilation dimension 1
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,   // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,   // data
            int                   s0,  // stride
            int                   p0,  // padding
            int                   d0); // dilation
@ -1602,29 +1637,29 @@ extern "C" {
    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,  // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,  // data
-            int                   s,
+            int                   s,  // stride
-            int                   d);
+            int                   d); // dilation
    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,   // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,   // data
-            int                   s0,
+            int                   s0,  // stride
-            int                   p0,
+            int                   p0,  // padding
-            int                   d0);
+            int                   d0); // dilation
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
+            struct ggml_tensor  * a,   // convolution kernel
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * b,   // data
-            int                   s0,
+            int                   s0,  // stride dimension 0
-            int                   s1,
+            int                   s1,  // stride dimension 1
-            int                   p0,
+            int                   p0,  // padding dimension 0
-            int                   p1,
+            int                   p1,  // padding dimension 1
-            int                   d0,
+            int                   d0,  // dilation dimension 0
-            int                   d1);
+            int                   d1); // dilation dimension 1
    // kernel size is a->ne[0] x a->ne[1]
@ -1686,6 +1721,18 @@ extern "C" {
            float                 p0,
            float                 p1);
    GGML_API struct ggml_tensor * ggml_pool_2d_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * af, // "a"/input used in forward pass
            enum ggml_op_pool     op,
            int                   k0,
            int                   k1,
            int                   s0,
            int                   s1,
            float                 p0,
            float                 p1);
    // nearest interpolate
    // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
@ -1760,7 +1807,8 @@ extern "C" {
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask,
            float                 scale,
-            float                 max_bias);
+            float                 max_bias,
            float                 logit_softcap);
    GGML_API void ggml_flash_attn_ext_set_prec(
            struct ggml_tensor * a,
@ -1777,10 +1825,8 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
-            struct ggml_tensor  * s,
+            struct ggml_tensor  * sx,
-            struct ggml_tensor  * x,
+            struct ggml_tensor  * c);
            struct ggml_tensor  * c,
            struct ggml_tensor  * sq);
    GGML_API struct ggml_tensor * ggml_ssm_scan(
            struct ggml_context * ctx,
@ -1789,8 +1835,7 @@ extern "C" {
            struct ggml_tensor  * dt,
            struct ggml_tensor  * A,
            struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
+            struct ggml_tensor  * C);
            struct ggml_tensor  * sq);
    // partition into non-overlapping windows with padding if needed
    // example:
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -549,6 +549,13 @@ if (GGML_SYCL)
    file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
    find_package(DNNL)
    message("-- DNNL found:" ${DNNL_FOUND})
    if (GGML_SYCL_TARGET STREQUAL "INTEL")
        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
    else()
        add_compile_definitions(GGML_SYCL_DNNL=0)
    endif()
    if (WIN32)
        find_package(IntelSYCL REQUIRED)
        find_package(MKL REQUIRED)
@ -561,6 +568,9 @@ if (GGML_SYCL)
            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
        endif()
    endif()
    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
        list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
    endif()
 endif()
 if (GGML_RPC)
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@ -337,33 +337,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
 }
 size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
+    UNUSED(quant_weights);
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
    }
    else {
        assert(false);
        return 0;
    }
 }
 size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
+    UNUSED(quant_weights);
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
    }
    else {
        assert(false);
        return 0;
    }
 }
 size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
+    UNUSED(quant_weights);
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
    }
    else {
        assert(false);
        return 0;
    }
 }
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -9,8 +9,10 @@
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/convert.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
@ -29,7 +31,6 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include <algorithm>
 #include <array>
@ -2181,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_ADD:
            ggml_cuda_op_add(ctx, dst);
            break;
        case GGML_OP_SUB:
            ggml_cuda_op_sub(ctx, dst);
            break;
        case GGML_OP_ACC:
            ggml_cuda_op_acc(ctx, dst);
            break;
@ -2267,6 +2271,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SQRT:
            ggml_cuda_op_sqrt(ctx, dst);
            break;
        case GGML_OP_SIN:
            ggml_cuda_op_sin(ctx, dst);
            break;
        case GGML_OP_COS:
            ggml_cuda_op_cos(ctx, dst);
            break;
        case GGML_OP_CLAMP:
            ggml_cuda_op_clamp(ctx, dst);
            break;
@ -2303,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_FLASH_ATTN_EXT:
            ggml_cuda_flash_attn_ext(ctx, dst);
            break;
        case GGML_OP_CROSS_ENTROPY_LOSS:
            ggml_cuda_cross_entropy_loss(ctx, dst);
            break;
        default:
            return false;
    }
@ -2610,6 +2623,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    if (node->src[j] != nullptr) {
                        assert(node->src[j]->buffer);
                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                    }
                }
@ -2853,12 +2867,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
@ -2890,6 +2907,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
            }
            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
        case GGML_OP_CROSS_ENTROPY_LOSS:
            return true;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        default:
            return false;
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
 }
 static __device__ __forceinline__ float op_sub(const float a, const float b) {
    return a - b;
 }
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
 }
@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@ -2,5 +2,6 @@
 void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cu
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@ -0,0 +1,106 @@
 #include "common.cuh"
 #include "cross-entropy-loss.cuh"
 #include "sumrows.cuh"
 #include <cmath>
 #include <cstdint>
 static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;
    const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
    const int ne_tmp = WARP_SIZE*nclasses;
    extern __shared__ float tmp_all[];
    float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
    float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
    // Each warp first loads ne_tmp logits/labels into shared memory:
    for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
        const int ig = i0*nclasses + i; // ig == i global
        tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
        tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
    }
    // Each thread in the warp then calculates the cross entropy loss for a single row.
    // TODO: pad in order to avoid shared memory bank conflicts.
    // Find maximum for softmax:
    float max = -INFINITY;
    for (int i = 0; i < nclasses; ++i) {
        max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
    }
    // Calculate log(softmax(logits)) which is just logits - max:
    float sum = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        float val = tmp_logits[lane_id*nclasses + i] - max;
        sum += expf(val);
        tmp_logits[lane_id*nclasses + i] = val;
    }
    sum = logf(sum);
    // log(exp(logits - max) / sum) = (logits - max) - log(sum)
    float loss = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
    }
    loss = -warp_reduce_sum(loss) / (float)k;
    __syncthreads();
    if (lane_id == 0) {
        tmp_all[warp_id] = loss;
    }
    __syncthreads();
    if (warp_id != 0) {
        return;
    }
    loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
    loss = warp_reduce_sum(loss);
    if (lane_id != 0) {
        return;
    }
    dst[blockIdx.x] = loss;
 }
 void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    GGML_ASSERT(ggml_is_contiguous(dst));
    const int64_t ne00  = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);
    const float * src0_d = (const float *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float       *) dst->data;
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t stream = ctx.stream();
    const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
    const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
    const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
    ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
    cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
    // Combine results from individual blocks:
    sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
 }
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cuh
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cuh
@ -0,0 +1,5 @@
 #include "common.cuh"
 #define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
 void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -657,11 +658,17 @@ void launch_fattn(
    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const int  shmem = 0;
-    float scale    = 1.0f;
+    float scale         = 1.0f;
-    float max_bias = 0.0f;
+    float max_bias      = 0.0f;
    float logit_softcap = 0.0f;
-    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&scale,         (float *) KQV->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&max_bias,      (float *) KQV->op_params + 1, sizeof(float));
    memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
    if (logit_softcap != 0.0f) {
        scale /= logit_softcap;
    }
    const uint32_t n_head      = Q->ne[2];
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@ -675,7 +682,7 @@ void launch_fattn(
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
        (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
-        scale, max_bias, m0, m1, n_head_log2,
+        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
        mask ? mask->ne[1] : 0, mask ?  mask->nb[1] : 0,
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@ -4,7 +4,7 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64
-template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
        const int ne2,
        const int ne3) {
 #ifdef FP16_AVAILABLE
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;
-                half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                half sum;
                if (use_logit_softcap) {
                    const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
                    sum = logit_softcap * tanhf(tmp.x + tmp.y);
                } else {
                    sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
                }
                sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
                kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
 #endif // FP16_AVAILABLE
 }
-template <int cols_per_block, int parallel_blocks>
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
 void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
        case  64: {
            constexpr int      D = 64;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block = 32;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    constexpr int cols_per_block = 32;
    constexpr int parallel_blocks = 1;
-    launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@ -4,7 +4,7 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32
-template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f32(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -43,6 +44,12 @@ static __global__ void flash_attn_tile_ext_f32(
        const int ne1,
        const int ne2,
        const int ne3) {
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@ -151,6 +158,10 @@ static __global__ void flash_attn_tile_ext_f32(
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;
                if (use_logit_softcap) {
                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
                }
                sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
                kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
@ -267,20 +278,20 @@ static __global__ void flash_attn_tile_ext_f32(
    }
 }
-template <int cols_per_block, int parallel_blocks>
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
 void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
        case  64: {
            constexpr int      D = 64;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
@ -290,23 +301,45 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
 }
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q = dst->src[0];
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block = 32;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    constexpr int cols_per_block = 32;
    constexpr int parallel_blocks = 1;
-    launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f16(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -41,6 +42,12 @@ static __global__ void flash_attn_vec_ext_f16(
        const int ne2,
        const int ne3) {
 #ifdef FP16_AVAILABLE
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
@ -190,6 +197,11 @@ static __global__ void flash_attn_vec_ext_f16(
            for (int j = 0; j < ncols; ++j) {
                half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
                sum = warp_reduce_sum(sum);
                if (use_logit_softcap) {
                    sum = logit_softcap*tanhf(sum);
                }
                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
                if (ncols == 1) {
@ -286,10 +298,10 @@ static __global__ void flash_attn_vec_ext_f16(
 #endif // FP16_AVAILABLE
 }
-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@ -297,48 +309,81 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * KQV = dst;
+    const ggml_tensor * KQV = dst;
-    ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * Q   = dst->src[0];
-    ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * K   = dst->src[1];
-    ggml_tensor * V   = dst->src[2];
+    const ggml_tensor * V   = dst->src[2];
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
    GGML_ASSERT(K->type == type_K);
    GGML_ASSERT(V->type == type_V);
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
    if (Q->ne[1] == 1) {
        constexpr int cols_per_block  = 1;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] == 2) {
        constexpr int cols_per_block  = 2;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 4) {
        constexpr int cols_per_block  = 4;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 8) {
        constexpr int cols_per_block  = 8;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    constexpr int cols_per_block  = 8;
    constexpr int parallel_blocks = 1;
-    ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }
 #define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)                          \
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f32(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -40,6 +41,12 @@ static __global__ void flash_attn_vec_ext_f32(
        const int ne1,
        const int ne2,
        const int ne3) {
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
@ -180,6 +187,11 @@ static __global__ void flash_attn_vec_ext_f32(
            for (int j = 0; j < ncols; ++j) {
                float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
                sum = warp_reduce_sum(sum);
                if (use_logit_softcap) {
                    sum = logit_softcap*tanhf(sum);
                }
                sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
@ -267,10 +279,10 @@ static __global__ void flash_attn_vec_ext_f32(
    }
 }
-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@ -278,44 +290,78 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * KQV = dst;
-    ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * Q   = dst->src[0];
-    ggml_tensor * V   = dst->src[2];
+    const ggml_tensor * K   = dst->src[1];
    const ggml_tensor * V   = dst->src[2];
    GGML_ASSERT(K->type == type_K);
    GGML_ASSERT(V->type == type_V);
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
    if (Q->ne[1] == 1) {
        constexpr int cols_per_block  = 1;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] == 2) {
        constexpr int cols_per_block  = 2;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 4) {
        constexpr int cols_per_block  = 4;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    if (Q->ne[1] <= 8) {
        constexpr int cols_per_block  = 8;
        constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }
    constexpr int cols_per_block  = 8;
    constexpr int parallel_blocks = 1;
-    ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }
 #define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V)                          \
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
@ -6,7 +6,7 @@
 #endif // FP16_MMA_AVAILABLE
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
-template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
+template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -22,6 +22,7 @@ static __global__ void flash_attn_ext_f16(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@ -46,6 +47,12 @@ static __global__ void flash_attn_ext_f16(
        const int ne2,
        const int ne3) {
 #ifdef FP16_MMA_AVAILABLE
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
@ -85,6 +92,8 @@ static __global__ void flash_attn_ext_f16(
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);
    const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
    frag_b Q_b[D/16][ncols/frag_n];
    // A single buffer for temporarily holding tiles of KQ and VKQ parts:
@ -194,6 +203,10 @@ static __global__ void flash_attn_ext_f16(
                    const int k = k0 + threadIdx.x;
                    KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
                    if (use_logit_softcap) {
                        KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
                    }
                }
                float KQ_max_new = KQ_max_f[j0/nwarps];
@ -237,6 +250,15 @@ static __global__ void flash_attn_ext_f16(
                    const int k = k0 + threadIdx.x;
                    KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
                    if (use_logit_softcap) {
                        // There is no dedicated tangens hyperbolicus function for half2.
                        KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
                        KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
                                               /(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
                        KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
                    }
                }
                half2 KQ_max_new = KQ_max_h2[j0/nwarps];
@ -427,7 +449,8 @@ static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 template <int D, int cols_per_block, typename KQ_acc_t>
 void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
+    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];
    constexpr int nwarps = 4;
@ -435,20 +458,50 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
    const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
    if (4*blocks_num_pb1 < 2*nsm) {
        constexpr int parallel_blocks = 4;
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+        fattn_kernel_t fattn_kernel;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            fattn_kernel = flash_attn_ext_f16<
                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
        } else {
            constexpr bool use_logit_softcap = true;
            fattn_kernel = flash_attn_ext_f16<
                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
        }
        launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        return;
    }
    if (2*blocks_num_pb1 < 2*nsm) {
        constexpr int parallel_blocks = 2;
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+        fattn_kernel_t fattn_kernel;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
            fattn_kernel = flash_attn_ext_f16<
                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
        } else {
            constexpr bool use_logit_softcap = true;
            fattn_kernel = flash_attn_ext_f16<
                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
        }
        launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        return;
    }
    constexpr int parallel_blocks = 1;
-    fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        fattn_kernel = flash_attn_ext_f16<
            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
    } else {
        constexpr bool use_logit_softcap = true;
        fattn_kernel = flash_attn_ext_f16<
            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
    }
    launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
 }
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@ -13,7 +13,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@ -301,7 +301,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    ggml_cuda_set_device(ctx.device);
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
    // On AMD the tile kernels perform poorly, use the vec kernel instead:
    if (cc >= CC_OFFSET_AMD) {
--- a/ggml/src/ggml-cuda/sumrows.cu
+++ b/ggml/src/ggml-cuda/sumrows.cu
@ -16,7 +16,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
    }
 }
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
@ -32,7 +32,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);
--- a/ggml/src/ggml-cuda/sumrows.cuh
+++ b/ggml/src/ggml-cuda/sumrows.cuh
@ -1,3 +1,5 @@
 #include "common.cuh"
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
    dst[i] = sqrtf(x[i]);
 }
 static __global__ void sin_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = sinf(x[i]);
 }
 static __global__ void cos_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = cosf(x[i]);
 }
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
    sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@ -9,6 +9,8 @@
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_SQRT_BLOCK_SIZE 256
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@ -31,6 +31,8 @@ struct ggml_metal_kernel {
 enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_ADD,
    GGML_METAL_KERNEL_TYPE_ADD_ROW,
    GGML_METAL_KERNEL_TYPE_SUB,
    GGML_METAL_KERNEL_TYPE_SUB_ROW,
    GGML_METAL_KERNEL_TYPE_MUL,
    GGML_METAL_KERNEL_TYPE_MUL_ROW,
    GGML_METAL_KERNEL_TYPE_DIV,
@ -82,6 +84,8 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_RMS_NORM,
    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
    GGML_METAL_KERNEL_TYPE_NORM,
    GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
    GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
@ -205,6 +209,9 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
    GGML_METAL_KERNEL_TYPE_CONCAT,
    GGML_METAL_KERNEL_TYPE_SQR,
    GGML_METAL_KERNEL_TYPE_SQRT,
    GGML_METAL_KERNEL_TYPE_SIN,
    GGML_METAL_KERNEL_TYPE_COS,
    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
    GGML_METAL_KERNEL_TYPE_COUNT
@ -491,6 +498,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                           add,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                       add_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB,                           sub,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW,                       sub_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                           mul,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                       mul_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                           div,                            true);
@ -542,6 +551,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                          norm,                           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                  ssm_conv_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                  ssm_scan_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 ctx->support_simdgroup_reduction);
@ -665,6 +676,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,                cpy_f32_iq4_nl,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                        concat,                         true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                           sqr,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                          sqrt,                           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
    }
@ -765,15 +779,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_ACC:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_REPEAT:
        case GGML_OP_SCALE:
        case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SUM_ROWS:
            return true;
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_GROUP_NORM:
@ -803,6 +822,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
                return false;
            }
            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_SSM_CONV:
        case GGML_OP_SSM_SCAN:
            return true;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return ctx->support_simdgroup_reduction &&
@ -1050,6 +1072,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                    } break;
                case GGML_OP_ADD:
                case GGML_OP_SUB:
                case GGML_OP_MUL:
                case GGML_OP_DIV:
                    {
@ -1073,6 +1096,7 @@ static enum ggml_status ggml_metal_graph_compute(
                            nb = ne00 / 4;
                            switch (dst->op) {
                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
                                default: GGML_ABORT("fatal error");
@ -1082,6 +1106,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        } else {
                            switch (dst->op) {
                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
                                default: GGML_ABORT("fatal error");
@ -1409,6 +1434,48 @@ static enum ggml_status ggml_metal_graph_compute(
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SQRT:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SIN:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_COS:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SUM_ROWS:
@ -1538,6 +1605,121 @@ static enum ggml_status ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        }
                    } break;
                case GGML_OP_SSM_CONV:
                    {
                        GGML_ASSERT(src0t == GGML_TYPE_F32);
                        GGML_ASSERT(src1t == GGML_TYPE_F32);
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        GGML_ASSERT(ggml_is_contiguous(src1));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                        [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                        [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
                        [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
                        [encoder setBytes:&ne01    length:sizeof(ne01) atIndex:4];
                        [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:5];
                        [encoder setBytes:&nb00    length:sizeof(nb00) atIndex:6];
                        [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
                        [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
                        [encoder setBytes:&ne10    length:sizeof(ne10) atIndex:9];
                        [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:10];
                        [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
                        [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
                        [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
                        [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
                        [encoder setBytes:&ne2     length:sizeof(ne2)  atIndex:15];
                        [encoder setBytes:&nb0     length:sizeof(nb0)  atIndex:16];
                        [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:17];
                        [encoder setBytes:&nb2     length:sizeof(nb2)  atIndex:18];
                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SSM_SCAN:
                    {
                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];
                        struct ggml_tensor * src4 = gf->nodes[i]->src[4];
                        struct ggml_tensor * src5 = gf->nodes[i]->src[5];
                        GGML_ASSERT(src3);
                        GGML_ASSERT(src4);
                        GGML_ASSERT(src5);
                        size_t offs_src3 = 0;
                        size_t offs_src4 = 0;
                        size_t offs_src5 = 0;
                        id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
                        id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
                        id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
                        const int64_t  ne30 = src3->ne[0]; GGML_UNUSED(ne30);
                        const int64_t  ne31 = src3->ne[1]; GGML_UNUSED(ne31);
                        const uint64_t nb30 = src3->nb[0];
                        const uint64_t nb31 = src3->nb[1];
                        const int64_t  ne40 = src4->ne[0]; GGML_UNUSED(ne40);
                        const int64_t  ne41 = src4->ne[1]; GGML_UNUSED(ne41);
                        const int64_t  ne42 = src4->ne[2]; GGML_UNUSED(ne42);
                        const uint64_t nb40 = src4->nb[0];
                        const uint64_t nb41 = src4->nb[1];
                        const uint64_t nb42 = src4->nb[2];
                        const int64_t  ne50 = src5->ne[0]; GGML_UNUSED(ne50);
                        const int64_t  ne51 = src5->ne[1]; GGML_UNUSED(ne51);
                        const int64_t  ne52 = src5->ne[2]; GGML_UNUSED(ne52);
                        const uint64_t nb50 = src5->nb[0];
                        const uint64_t nb51 = src5->nb[1];
                        const uint64_t nb52 = src5->nb[2];
                        const int64_t d_state      = ne00;
                        const int64_t d_inner      = ne01;
                        const int64_t n_seq_tokens = ne11;
                        const int64_t n_seqs       = ne02;
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                        [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
                        [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
                        [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
                        [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:6];
                        [encoder setBytes:&d_state      length:sizeof(d_state)      atIndex:7];
                        [encoder setBytes:&d_inner      length:sizeof(d_inner)      atIndex:8];
                        [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
                        [encoder setBytes:&n_seqs       length:sizeof(n_seqs)       atIndex:10];
                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
                        [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
                        [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
                        [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
                        [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
                        [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
                        [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
                        [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
                        [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
                        [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
                        [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
                        [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
                        [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_MUL_MAT:
                    {
                        GGML_ASSERT(ne00 == ne10);
@ -2624,9 +2806,14 @@ static enum ggml_status ggml_metal_graph_compute(
                        float scale;
                        float max_bias;
                        float logit_softcap;
                        memcpy(&scale,         ((int32_t *) dst->op_params) + 0, sizeof(scale));
                        memcpy(&max_bias,      ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
                        memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
-                        memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                        if (logit_softcap != 0.0f) {
-                        memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
+                            scale /= logit_softcap;
                        }
                        const uint32_t n_head      = src0->ne[2];
                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@ -2677,30 +2864,31 @@ static enum ggml_status ggml_metal_graph_compute(
                        } else {
                            [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
                        }
-                        [encoder setBuffer:id_dst      offset:offs_dst            atIndex:4];
+                        [encoder setBuffer:id_dst        offset:offs_dst              atIndex:4];
-                        [encoder setBytes:&ne01        length:sizeof( int64_t)    atIndex:5];
+                        [encoder setBytes:&ne01          length:sizeof( int64_t)      atIndex:5];
-                        [encoder setBytes:&ne02        length:sizeof( int64_t)    atIndex:6];
+                        [encoder setBytes:&ne02          length:sizeof( int64_t)      atIndex:6];
-                        [encoder setBytes:&ne03        length:sizeof( int64_t)    atIndex:7];
+                        [encoder setBytes:&ne03          length:sizeof( int64_t)      atIndex:7];
-                        [encoder setBytes:&nb01        length:sizeof(uint64_t)    atIndex:8];
+                        [encoder setBytes:&nb01          length:sizeof(uint64_t)      atIndex:8];
-                        [encoder setBytes:&nb02        length:sizeof(uint64_t)    atIndex:9];
+                        [encoder setBytes:&nb02          length:sizeof(uint64_t)      atIndex:9];
-                        [encoder setBytes:&nb03        length:sizeof(uint64_t)    atIndex:10];
+                        [encoder setBytes:&nb03          length:sizeof(uint64_t)      atIndex:10];
-                        [encoder setBytes:&ne11        length:sizeof( int64_t)    atIndex:11];
+                        [encoder setBytes:&ne11          length:sizeof( int64_t)      atIndex:11];
-                        [encoder setBytes:&ne12        length:sizeof( int64_t)    atIndex:12];
+                        [encoder setBytes:&ne12          length:sizeof( int64_t)      atIndex:12];
-                        [encoder setBytes:&ne13        length:sizeof( int64_t)    atIndex:13];
+                        [encoder setBytes:&ne13          length:sizeof( int64_t)      atIndex:13];
-                        [encoder setBytes:&nb11        length:sizeof(uint64_t)    atIndex:14];
+                        [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
-                        [encoder setBytes:&nb12        length:sizeof(uint64_t)    atIndex:15];
+                        [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
-                        [encoder setBytes:&nb13        length:sizeof(uint64_t)    atIndex:16];
+                        [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
-                        [encoder setBytes:&nb21        length:sizeof(uint64_t)    atIndex:17];
+                        [encoder setBytes:&nb21          length:sizeof(uint64_t)      atIndex:17];
-                        [encoder setBytes:&nb22        length:sizeof(uint64_t)    atIndex:18];
+                        [encoder setBytes:&nb22          length:sizeof(uint64_t)      atIndex:18];
-                        [encoder setBytes:&nb23        length:sizeof(uint64_t)    atIndex:19];
+                        [encoder setBytes:&nb23          length:sizeof(uint64_t)      atIndex:19];
-                        [encoder setBytes:&nb31        length:sizeof(uint64_t)    atIndex:20];
+                        [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:20];
-                        [encoder setBytes:&ne1         length:sizeof( int64_t)    atIndex:21];
+                        [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:21];
-                        [encoder setBytes:&ne2         length:sizeof( int64_t)    atIndex:22];
+                        [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:22];
-                        [encoder setBytes:&scale       length:sizeof(   float)    atIndex:23];
+                        [encoder setBytes:&scale         length:sizeof(   float)      atIndex:23];
-                        [encoder setBytes:&max_bias    length:sizeof(   float)    atIndex:24];
+                        [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:24];
-                        [encoder setBytes:&m0          length:sizeof(m0)          atIndex:25];
+                        [encoder setBytes:&m0            length:sizeof(m0)            atIndex:25];
-                        [encoder setBytes:&m1          length:sizeof(m1)          atIndex:26];
+                        [encoder setBytes:&m1            length:sizeof(m1)            atIndex:26];
-                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
+                        [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:27];
                        [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
                        if (!use_vec_kernel) {
                            // half8x8 kernel
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@ -17,7 +17,7 @@ enum ggml_sort_order {
    GGML_SORT_ORDER_DESC,
 };
-// general-purpose kernel for addition, multiplication and division of two tensors
+// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across all dims
 // cons: not very efficient
 kernel void kernel_add(
@ -70,6 +70,56 @@ kernel void kernel_add(
    }
 }
 kernel void kernel_sub(
        device const char * src0,
        device const char * src1,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
        constant  int64_t & ne3,
        constant uint64_t & nb0,
        constant uint64_t & nb1,
        constant uint64_t & nb2,
        constant uint64_t & nb3,
        constant  int64_t & offs,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig.z;
    const int64_t i02 = tgpig.y;
    const int64_t i01 = tgpig.x;
    const int64_t i13 = i03 % ne13;
    const int64_t i12 = i02 % ne12;
    const int64_t i11 = i01 % ne11;
    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
        const int i10 = i0 % ne10;
        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
    }
 }
 kernel void kernel_mul(
        device const char * src0,
        device const char * src1,
@ -226,6 +276,15 @@ kernel void kernel_add_row(
    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 kernel void kernel_sub_row(
        device const float4 * src0,
        device const float4 * src1,
        device       float4 * dst,
        constant   uint64_t & nb [[buffer(28)]],
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] - src1[tpig % nb];
 }
 kernel void kernel_mul_row(
        device const float4 * src0,
        device const float4 * src1,
@ -358,6 +417,27 @@ kernel void kernel_sqr(
    dst[tpig] = src0[tpig] * src0[tpig];
 }
 kernel void kernel_sqrt(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = sqrt(src0[tpig]);
 }
 kernel void kernel_sin(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = sin(src0[tpig]);
 }
 kernel void kernel_cos(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = cos(src0[tpig]);
 }
 kernel void kernel_sum_rows(
        device const float * src0,
        device       float * dst,
@ -667,6 +747,127 @@ kernel void kernel_diag_mask_inf_8(
    }
 }
 // ref: ggml.c:ggml_compute_forward_ssm_conv_f32
 // TODO: optimize
 kernel void kernel_ssm_conv_f32(
        device const  void * src0,
        device const  void * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t ir = tgpig.x;
    const int64_t i2 = tgpig.y;
    const int64_t i3 = tgpig.z;
    const int64_t nc  = ne10;
    const int64_t ncs = ne00;
    const int64_t nr  = ne01;
    const int64_t n_t = ne1;
    const int64_t n_s = ne2;
    device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
    device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
    device       float * x = (device       float *) ((device       char *) dst  + ir*nb0  + i2*nb1  + i3*nb2);
    float sumf = 0.0f;
    for (int64_t i0 = 0; i0 < nc; ++i0) {
        sumf += s[i0] * c[i0];
    }
    x[0] = sumf;
 }
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32
 // TODO: optimize
 kernel void kernel_ssm_scan_f32(
        device const void * src0,
        device const void * src1,
        device const void * src2,
        device const void * src3,
        device const void * src4,
        device const void * src5,
        device      float * dst,
        constant  int64_t & d_state,
        constant  int64_t & d_inner,
        constant  int64_t & n_seq_tokens,
        constant  int64_t & n_seqs,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
        constant uint64_t & nb20,
        constant uint64_t & nb21,
        constant uint64_t & nb22,
        constant uint64_t & nb30,
        constant uint64_t & nb31,
        constant uint64_t & nb40,
        constant uint64_t & nb41,
        constant uint64_t & nb42,
        constant uint64_t & nb50,
        constant uint64_t & nb51,
        constant uint64_t & nb52,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t ir = tgpig.x;
    const int64_t i3 = tgpig.y;
    const int64_t nc  = d_state;
    const int64_t nr  = d_inner;
    const int64_t n_t = n_seq_tokens;
    const int64_t n_s = n_seqs;
    for (int64_t i2 = 0; i2 < n_t; ++i2) {
        device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
        device const float * x  = (device const float *) ((device const char *) src1 + ir*nb10 + i2*nb11 + i3*nb12);
        device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*nb21 + i3*nb22);
        device const float * A  = (device const float *) ((device const char *) src3 + ir*nb31);
        device const float * B  = (device const float *) ((device const char *) src4 + i2*nb41 + i3*nb42);
        device const float * C  = (device const float *) ((device const char *) src5 + i2*nb51 + i3*nb52);
        device       float * y  = (device       float *) ((device       char *) dst  + ir*nb10 + i2*nb11 + i3*nb12); // TODO: do not use src1 strides
        device       float * s  = (device       float *) ((device       char *) dst  + ir*nb01 + i3*nb02 +    nb13);
        if (i2 > 0) {
            s0 = s;
        }
        // i1 == 0
        float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
        float x_dt = x[0] * dt_soft_plus;
        float sumf = 0.0f;
        for (int64_t i0 = 0; i0 < nc; ++i0) {
            int64_t i = i0;
            float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
            sumf += state * C[i0];
            s[i] = state;
        }
        y[0] = sumf;
    }
 }
 kernel void kernel_norm(
        device const  void * src0,
        device       float * dst,
@ -1976,6 +2177,7 @@ typedef void (flash_attn_ext_f16_t)(
        constant     float & m0,
        constant     float & m1,
        constant  uint32_t & n_head_log2,
        constant     float & logit_softcap,
        threadgroup   half * shared,
        uint3  tgpig[[threadgroup_position_in_grid]],
        uint3  tpitg[[thread_position_in_threadgroup]],
@ -2014,6 +2216,7 @@ kernel void kernel_flash_attn_ext_f16(
        constant     float & m0,
        constant     float & m1,
        constant  uint32_t & n_head_log2,
        constant     float & logit_softcap,
        threadgroup   half * shared [[threadgroup(0)]],
        uint3  tgpig[[threadgroup_position_in_grid]],
        uint3  tpitg[[thread_position_in_threadgroup]],
@ -2138,19 +2341,6 @@ kernel void kernel_flash_attn_ext_f16(
                    }
                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
                    const short tx = tiisg%4;
                    const short ty = tiisg/4;
                    if (mask != q) {
                        // mqk = mqk*scale + mask*slope
                        ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
                        ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
                    } else {
                        // mqk = mqk*scale
                        ss[8*cc + ty*TF + 2*tx + 0] *= scale;
                        ss[8*cc + ty*TF + 2*tx + 1] *= scale;
                    }
                }
            }
@ -2162,10 +2352,19 @@ kernel void kernel_flash_attn_ext_f16(
                float ms[Q];
                for (short j = 0; j < Q; ++j) {
                    const short p = tiisg;
                    const float m = M[j];
-                    const float s = ss[j*TF + p];
+
                    // scale and apply the logitcap / mask
                    float s = ss[j*TF + tiisg]*scale;
                    if (logit_softcap != 0.0f) {
                        s = logit_softcap*precise::tanh(s);
                    }
                    if (mask != q) {
                        // mqk = mqk + mask*slope
                        s += slope*mp[ic + j*nb31/sizeof(half) + tiisg];
                    }
                    smax = simd_max(max(smax, s));
                    M[j] = simd_max(max(M[j], s));
@ -2176,7 +2375,7 @@ kernel void kernel_flash_attn_ext_f16(
                    S[j] = S[j]*ms[j] + simd_sum(vs);
                    // the P matrix from the paper (Q rows, C columns)
-                    ss[j*TF + p] = vs;
+                    ss[j*TF + tiisg] = vs;
                }
                // create a QxQ diagonal matrix for rescaling the output
@ -2345,6 +2544,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
        constant     float & m0,
        constant     float & m1,
        constant  uint32_t & n_head_log2,
        constant     float & logit_softcap,
        threadgroup   half * shared [[threadgroup(0)]],
        uint3  tgpig[[threadgroup_position_in_grid]],
        uint3  tpitg[[thread_position_in_threadgroup]],
@ -2479,7 +2679,13 @@ kernel void kernel_flash_attn_ext_vec_f16(
                    // mqk = mqk*scale + mask*slope
                    if (tiisg == 0) {
-                        mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
+                        mqk *= scale;
                        if (logit_softcap != 0.0f) {
                            mqk = logit_softcap*precise::tanh(mqk);
                        }
                        mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f;
                        ss4[cc] = mqk;
                    }
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -3644,7 +3644,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
    quantize_row_q8_K_ref(x, y, k);
 }
-//===================================== Dot ptoducts =================================
+//===================================== Dot products =================================
 //
 // Helper functions
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@ -38,6 +38,7 @@
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 bool   ggml_sycl_loaded(void);
 void   ggml_sycl_free_data(struct ggml_tensor * tensor);
@ -2482,6 +2483,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const sycl::half alpha_f16 = 1.0f;
        const sycl::half beta_f16 = 0.0f;
 #if !GGML_SYCL_DNNL
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
            *stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@ -2491,6 +2493,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
            dpct::library_data_t::real_half)));
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
        auto dnnl_stream = ctx.stream_dnnl(stream);
        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
            src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
 #endif
    }
    else {
        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@ -2513,13 +2522,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const float alpha = 1.0f;
        const float beta = 0.0f;
-
+#if !GGML_SYCL_DNNL
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
            *stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
            src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
            dst_dd_i, ldc)));
 #else
        auto dnnl_stream = ctx.stream_dnnl(stream);
         DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
            src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
    }
    (void) dst;
    (void) src1_ddq_i;
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@ -19,6 +19,10 @@
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
 #endif
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
@ -277,6 +281,52 @@ struct ggml_backend_sycl_context {
        return stream(device, 0);
    }
 #if GGML_SYCL_DNNL
    dnnl::engine make_engine(sycl::queue* q) {
        // Get the device associated with the queue
        sycl::device dev = q->get_device();
        // Get the context associated with the queue
        sycl::context ctx = q->get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return eng;
    }
    std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
    std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }
    dnnl::engine engine_dnnl(sycl::queue* qptr) {
        auto it = engine_map.find(qptr);
        if (it == engine_map.end()) {
            auto eng = make_engine(qptr);
            engine_map[qptr] = eng;
            return eng;
        }
        else
        {
            return it->second;
        }
    }
    dnnl::stream stream_dnnl(sycl::queue* qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            auto eng = engine_dnnl(qptr);
            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
            stream_map[qptr] = stream;
            return stream;
        }
        else
        {
            return it->second;
        }
    }
    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }
 #endif
    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
--- a/ggml/src/ggml-sycl/gemm.hpp
+++ b/ggml/src/ggml-sycl/gemm.hpp
@ -0,0 +1,101 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #ifndef GGML_SYCL_GEMM_HPP
 #define GGML_SYCL_GEMM_HPP
 #include <fstream>
 #include <iostream>
 #include "ggml-sycl.h"
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
 class DnnlGemmWrapper {
 public:
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;
    template<typename T>
    static constexpr dt to_dt() {
        if constexpr (std::is_same_v<T, float>) return dt::f32;
        else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
        else static_assert(0);
    }
    static inline void row_gemm(sycl::queue& q, bool a_trans,
        bool b_trans, int m, int n, int k,
        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
    {
        // Get the device associated with the queue
        sycl::device dev = q.get_device();
        // Get the context associated with the queue
        sycl::context ctx = q.get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
        dnnl::memory::dims a_dims = { m, k };
        dnnl::memory::dims b_dims = { k, n };
        dnnl::memory::dims c_dims = { m, n };
        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
        // Create the primitive.
        auto matmul_prim = dnnl::matmul(matmul_pd);
        // Primitive arguments.
        std::unordered_map<int, dnnl::memory> matmul_args;
        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
        matmul_args.insert({ DNNL_ARG_DST, c_mem });
        matmul_prim.execute(stream, matmul_args);
    }
    static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
        bool b_trans, int m, int n, int k,
        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
    {
        auto const eng = stream.get_engine();
        dnnl::memory::dims a_dims = { m, k };
        dnnl::memory::dims b_dims = { k, n };
        dnnl::memory::dims c_dims = { m, n };
        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
        // Create the primitive.
        auto matmul_prim = dnnl::matmul(matmul_pd);
        // Primitive arguments.
        std::unordered_map<int, dnnl::memory> matmul_args;
        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
        matmul_args.insert({ DNNL_ARG_DST, c_mem });
        matmul_prim.execute(stream, matmul_args);
    }
 };
 #endif
 #endif // GGML_SYCL_GEMM_HPP
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@ -188,6 +188,8 @@ struct vk_device_struct {
    vk_pipeline pipeline_upscale_f32;
    vk_pipeline pipeline_scale_f32;
    vk_pipeline pipeline_sqr_f32;
    vk_pipeline pipeline_sin_f32;
    vk_pipeline pipeline_cos_f32;
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_pad_f32;
    vk_pipeline pipeline_repeat_f32;
@ -1702,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -4023,6 +4027,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_sqr_f32;
        }
        return nullptr;
    case GGML_OP_SIN:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_sin_f32;
        }
        return nullptr;
    case GGML_OP_COS:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_cos_f32;
        }
        return nullptr;
    case GGML_OP_CLAMP:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
@ -4171,6 +4185,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_REPEAT:
@ -4381,6 +4397,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_MUL:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_REPEAT:
@ -4598,6 +4616,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
    }, dryrun);
 }
 static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);
    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
        0.0f, 0.0f,
    });
 }
 static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);
    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
        0.0f, 0.0f,
    });
 }
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    float * op_params = (float *)dst->op_params;
    const uint32_t src0_type_size = ggml_type_size(src0->type);
@ -5658,6 +5702,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_CPY:
@ -5735,6 +5781,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_SQR:
        ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
        break;
    case GGML_OP_SIN:
        ggml_vk_sin(ctx, compute_ctx, src0, node);
        break;
    case GGML_OP_COS:
        ggml_vk_cos(ctx, compute_ctx, src0, node);
        break;
    case GGML_OP_CLAMP:
        ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
@ -5851,6 +5905,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_CPY:
@ -6582,6 +6638,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
        case GGML_OP_UPSCALE:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_PAD:
        case GGML_OP_CONT:
@ -7024,6 +7082,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
        tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
    } else if (tensor->op == GGML_OP_SQR) {
        tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_SIN) {
        tensor_clone = ggml_sin(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_COS) {
        tensor_clone = ggml_cos(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_CLAMP) {
        tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
    } else if (tensor->op == GGML_OP_PAD) {
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
--- a/ggml/src/vulkan-shaders/cos.comp
+++ b/ggml/src/vulkan-shaders/cos.comp
@ -0,0 +1,15 @@
 #version 450
 #include "types.comp"
 #include "generic_unary_head.comp"
 void main() {
    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
 }
--- a/ggml/src/vulkan-shaders/sin.comp
+++ b/ggml/src/vulkan-shaders/sin.comp
@ -0,0 +1,15 @@
 #version 450
 #include "types.comp"
 #include "generic_unary_head.comp"
 void main() {
    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
 }
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@ -396,6 +396,14 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
        string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
--- a/include/llama.h
+++ b/include/llama.h
@ -511,6 +511,9 @@ extern "C" {
    // to the decoder to start generating output sequence. For other models, it returns -1.
    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
    // Returns 0 on success
    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@ -1 +1 @@
-797faa25af14126eb30134d4033139ae3c5428ed
+28b7633d733bbeef0026570fbc61c79c5e9aa5ae
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@ -31,11 +31,17 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
+    size_t last_pos = 0;
-        s.replace(pos, search.length(), replace);
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        pos += replace.length();
+        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
 }
--- a/src/llama.cpp
+++ b/src/llama.cpp
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -949,6 +949,58 @@ struct test_rms_norm : public test_case {
    }
 };
 // GGML_OP_SSM_CONV
 struct test_ssm_conv : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a;
    const std::array<int64_t, 4> ne_b;
    std::string vars() override {
        return VARS_TO_STR3(type, ne_a, ne_b);
    }
    test_ssm_conv(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
            std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
        : type(type), ne_a(ne_a), ne_b(ne_b) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a   = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * b   = ggml_new_tensor(ctx, type, 4, ne_b.data());
        ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
        return out;
    }
 };
 // GGML_OP_SSM_SCAN
 struct test_ssm_scan : public test_case {
    const ggml_type type;
    const int64_t d_state;
    const int64_t d_inner;
    const int64_t n_seq_tokens;
    const int64_t n_seqs;
    std::string vars() override {
        return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
    }
    test_ssm_scan(ggml_type type = GGML_TYPE_F32,
            int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
        : type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * s   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner,      n_seqs, 1 }.data());
        ggml_tensor * x   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
        ggml_tensor * dt  = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
        ggml_tensor * A   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner,      1     , 1 }.data());
        ggml_tensor * B   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
        ggml_tensor * C   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
        ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
        return out;
    }
 };
 // GGML_OP_MUL_MAT
 struct test_mul_mat : public test_case {
    const ggml_type type_a;
@ -1108,6 +1160,58 @@ struct test_sqrt : public test_case {
    }
 };
 // GGML_OP_SIN
 struct test_sin : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }
    test_sin(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_sin(ctx, a);
        return out;
    }
    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            init_tensor_uniform(t, -100.0f, 100.0f);
        }
    }
 };
 // GGML_OP_COS
 struct test_cos : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }
    test_cos(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_cos(ctx, a);
        return out;
    }
    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            init_tensor_uniform(t, -100.0f, 100.0f);
        }
    }
 };
 // GGML_OP_CLAMP
 struct test_clamp : public test_case {
    const ggml_type type;
@ -1652,19 +1756,20 @@ struct test_flash_attn_ext : public test_case {
    const bool mask; // use mask
    const float max_bias; // ALiBi
    const float logit_softcap; // Gemma 2
    const ggml_type type_KV;
    std::string vars() override {
-        return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
+        return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
    }
    double max_nmse_err() override {
        return 5e-4;
    }
-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
-        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
+        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
@ -1673,7 +1778,28 @@ struct test_flash_attn_ext : public test_case {
        ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV,       hs_padded, kv, nh, 1);
        ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV,       hs_padded, kv, nh, 1);
        ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
        return out;
    }
 };
 // GGML_OP_CROSS_ENTROPY_LOSS
 struct test_cross_entropy_loss : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }
    test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
        return out;
    }
 };
@ -2239,6 +2365,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
    }
    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
 #if 1
    for (ggml_type type_a : base_types) {
        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@ -2334,6 +2466,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_sqr());
    test_cases.emplace_back(new test_sqrt());
    test_cases.emplace_back(new test_sin());
    test_cases.emplace_back(new test_cos());
    test_cases.emplace_back(new test_clamp());
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10,  1,  1}, 5));
@ -2437,11 +2571,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        for (bool mask : { true, false } ) {
            for (float max_bias : { 0.0f, 8.0f }) {
                if (!mask && max_bias > 0.0f) continue;
-                for (int nh : { 32, }) {
+                for (float logit_softcap : {0.0f, 10.0f}) {
-                    for (int kv : { 512, 1024, }) {
+                    if (hs != 128 && logit_softcap != 0.0f) continue;
-                        for (int nb : { 1, 2, 4, 8, }) {
+                    for (int nh : { 32, }) {
-                            for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
+                        for (int kv : { 512, 1024, }) {
-                                test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
+                            for (int nb : { 1, 2, 4, 8, }) {
                                for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
                                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
                                }
                            }
                        }
                    }
@ -2450,6 +2587,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    }
    test_cases.emplace_back(new test_cross_entropy_loss());
    // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
    test_cases.emplace_back(new test_llama(1));
@ -2483,7 +2622,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }
    GGML_ABORT("fatal error");
    return false;
 }
 static void usage(char ** argv) {
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -1,10 +1,14 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"
 #include <cfloat>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cassert>
 #include <initializer_list>
 #include <vector>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -217,7 +221,8 @@ static bool check_gradient(
        int nargs,
        float eps,
        float max_error_abs,
-        float max_error_rel) {
+        float max_error_rel,
        std::vector<double> expected_vals) {
    static int n_threads = -1;
    if (n_threads < 0) {
@ -248,9 +253,10 @@ static bool check_gradient(
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");
    for (int i = 0; i < nargs; ++i) {
        bool all_g0_bad = true;
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
-            // compute gradient using finite differences
+            // Calculate gradient numerically:
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
@ -267,6 +273,28 @@ static bool check_gradient(
            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);
            // The numerical calculation of the gradient fails around noncontinuities (e.g. 0 for ReLU).
            // In such cases, provide a vector of expected values and skip the comparison for failed calculations.
            if (!expected_vals.empty()) {
                bool matches_any = false;
                for (const double & ev : expected_vals) {
                    const double error_abs = std::fabs(g0 - ev);
                    if (error_abs > max_error_abs) {
                        continue;
                    }
                    const double error_rel = g0 != 0.0 ? fabs(g0 - ev)/fabs(g0) : 0.0;
                    if (error_rel > max_error_rel) {
                        continue;
                    }
                    matches_any = true;
                    break;
                }
                if (!matches_any) {
                    continue;
                }
            }
            all_g0_bad = false;
            ggml_set_f32_1d(x[i], k, x0);
            // compute gradient using backward graph
@ -278,7 +306,7 @@ static bool check_gradient(
            const double g1 = ggml_get_f32_1d(x[i]->grad, k);
            const double error_abs = fabs(g0 - g1);
-            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
+            const double error_rel = g0 != 0.0 ? fabs(g0 - g1)/fabs(g0) : 0.0;
            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@ -287,6 +315,10 @@ static bool check_gradient(
                return false;
            }
        }
        if (all_g0_bad) {
            printf("%s: numerical calculation of the gradient failed for all values\n", op_name);
            return false;
        }
    }
    return true;
@ -404,7 +436,7 @@ int main(int argc, const char ** argv) {
        seed_iter = rand();
        unsigned seed = rand();
-        printf("test-grad0: iter:%d/%d\n", iter, niter);
+        printf("test-grad0: iter:%d/%d\n", (iter+1), niter);
        struct ggml_context * ctx0 = ggml_init(params);
        get_random_dims(ne, 4);
@ -424,7 +456,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f, {});
            }
        }
@ -441,7 +473,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
+                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f, {});
            }
        }
@ -458,7 +490,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
-                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -475,7 +507,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
-                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -492,7 +524,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
-                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
+                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f, {});
            }
        }
@ -509,7 +541,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
-                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -526,7 +558,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
-                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
+                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f, {});
            }
        }
@ -543,7 +575,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
-                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f, {});
            }
        }
@ -560,7 +592,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
-                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -578,7 +610,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
-                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }
@ -596,7 +628,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
-                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -614,7 +646,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
-                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -637,7 +669,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
-                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }
@ -660,25 +692,25 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
-                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }
-        // abs (finite differences do not work)
+        // abs
-        //{
+        {
-        //    const int nargs = 1;
+           const int nargs = 1;
-        //    for (int ndims = 1; ndims <= 2; ++ndims) {
+           for (int ndims = 1; ndims <= 4; ++ndims) {
-        //        for (int i = 0; i < nargs; ++i) {
+               for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                   x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        //            ggml_set_param(ctx0, x[i]);
+                   ggml_set_param(ctx0, x[i]);
-        //        }
+               }
-        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
+               struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
-        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
+               check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f, {-1.0, 1.0});
-        //    }
+           }
-        //}
+        }
        // sgn
        {
@ -693,7 +725,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
-                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
            }
        }
@ -710,7 +742,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
-                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -727,7 +759,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
-                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
            }
        }
@ -745,7 +777,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
-                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -776,7 +808,7 @@ int main(int argc, const char ** argv) {
                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
@ -800,7 +832,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
-                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -817,7 +849,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
-                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {0.0, 1.0});
            }
        }
@ -835,7 +867,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
-                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }
@ -854,9 +886,9 @@ int main(int argc, const char ** argv) {
 #ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
+                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY, {});
 #else
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
 #endif
            }
        }
@ -874,7 +906,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
-                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
+                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY, {});
            }
        }
@ -892,7 +924,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
-                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -910,7 +942,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -928,7 +960,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
+                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
            }
        }
@ -952,7 +984,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -976,7 +1008,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1004,7 +1036,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1037,7 +1069,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1072,7 +1104,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1109,7 +1141,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1137,7 +1169,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
-                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1170,7 +1202,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
-                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1194,7 +1226,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
-                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1225,7 +1257,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
-                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1257,7 +1289,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
-                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1291,7 +1323,7 @@ int main(int argc, const char ** argv) {
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
-                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1319,7 +1351,7 @@ int main(int argc, const char ** argv) {
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
-                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1337,7 +1369,7 @@ int main(int argc, const char ** argv) {
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
-            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }
        // diag_mask_inf
@ -1353,7 +1385,7 @@ int main(int argc, const char ** argv) {
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
-            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }
        // diag_mask_zero
@ -1369,7 +1401,7 @@ int main(int argc, const char ** argv) {
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
-            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }
        // softmax
@ -1395,7 +1427,7 @@ int main(int argc, const char ** argv) {
                                                        1.0f - eps),
                                                    ggml_new_f32(ctx0, eps))));
-                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY, {});
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in different gradients too finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
@ -1412,7 +1444,7 @@ int main(int argc, const char ** argv) {
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
@ -1430,7 +1462,7 @@ int main(int argc, const char ** argv) {
                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
-                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
+                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
@ -1468,7 +1500,7 @@ int main(int argc, const char ** argv) {
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
                    }
                }
            }
@ -1508,12 +1540,93 @@ int main(int argc, const char ** argv) {
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
+                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
                    }
                }
            }
        }
        // im2col f32
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 4;
            for (const bool is_2D : {false, true}) {
                int64_t ne0[ndims];
                int64_t ne1[ndims];
                get_random_dims(ne0, ndims);
                get_random_dims(ne1, ndims);
                // // Ensure that the output is not zero-sized:
                ne1[0] += 8;
                ne1[1] += 8;
                if (is_2D) {
                    ne1[2] = ne0[2];
                } else {
                    ne1[1] = ne0[1];
                    ne0[3] = 1;
                    ne1[3] = 1;
                }
                // The order of arguments is swapped because the first tensor is only used for its shape.
                x[1] = get_random_tensor_f16(ctx0, ndims, ne0, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne1, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int s0 =         1 + irand(2);
                const int s1 = is_2D ? 1 + irand(2) : 0;
                const int p0 =         0 + irand(2);
                const int p1 = is_2D ? 0 + irand(2) : 0;
                const int d0 =         1 + irand(2);
                const int d1 = is_2D ? 1 + irand(2) : 0;
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_im2col(ctx0, x[1], x[0], s0, s1, p0, p1, d0, d1, is_2D, GGML_TYPE_F32));
                GGML_PRINT_DEBUG("im2col f32: is_2D=%s, s0=%d, s1=%d, p0=%d, p1=%d, d0=%d, d1=%d\n", is_2D ? "yes" : "no", s0, s1, p0, p1, d0, d1);
                check_gradient("im2col f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
            }
        }
        // pool_2d f32
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 4;
            for (const enum ggml_op_pool op : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
                int64_t ne0[ndims];
                get_random_dims(ne0, ndims);
                ne0[0] += 8;
                ne0[1] += 8;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne0, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int k0 = 2 + irand(2);
                const int k1 = 2 + irand(2);
                const int s0 = 2 + irand(2);
                const int s1 = 2 + irand(2);
                const int p0 = 0 + irand(2);
                const int p1 = 0 + irand(2);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_pool_2d(ctx0, x[0], op, k0, k1, s0, s1, p0, p1));
                GGML_PRINT_DEBUG("ggml_pool_2d f32: op=%s k0=%d, k1=%d, s0=%d, s1=%d, p0=%d, p1=%d\n",
                                 op == GGML_OP_POOL_MAX ? "max" : "avg", k0, k1, s0, s1, p0, p1);
                std::vector<double> expected_vals;
                if (op == GGML_OP_POOL_MAX) {
                    expected_vals.push_back(0.0);
                    expected_vals.push_back(1.0);
                }
                check_gradient("ggml_pool_2d f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, expected_vals);
            }
        }
        // flash_attn f32
        // TODO: adapt to ggml_flash_attn_ext() changes
        //{
@ -1553,7 +1666,7 @@ int main(int argc, const char ** argv) {
        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
+        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY, {});
        //            }
        //        }
        //    }
--- a/tests/test-lora-conversion-inference.sh
+++ b/tests/test-lora-conversion-inference.sh
@ -14,7 +14,7 @@ MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
 # Clone the Hugging Face repository if the directory does not exist
 if [ ! -d "$MODELS_REPO" ]; then
    echo "Cloning the Hugging Face repository..."
-    git clone $MODELS_REPO_URL
+    git clone $MODELS_REPO_URL --depth 1
 else
    echo "Repository already exists. Skipping clone."
 fi
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -166,12 +166,12 @@ static void test_sampler_queue(
    for (auto s : samplers_sequence) {
        switch (s){
            case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ABORT("tail_free test not implemented");   break;
+            case 'f': GGML_ABORT("tail_free test not implemented");
-            case 'y': GGML_ABORT("typical test not implemented");     break;
+            case 'y': GGML_ABORT("typical test not implemented");
            case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
            case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ABORT("temperature test not implemented"); break;
+            case 't': GGML_ABORT("temperature test not implemented");
-            default : GGML_ABORT("Unknown sampler");                  break;
+            default : GGML_ABORT("Unknown sampler");
        }
        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
`@ -1 +1 @@`
	`797faa25af14126eb30134d4033139ae3c5428ed`	`28b7633d733bbeef0026570fbc61c79c5e9aa5ae`