Merge branch 'master' into compilade/bitnet-ternary

2024-12-26 19:34:35 +00:00 · 2024-07-28 21:27:33 -04:00 · 2024-07-28 21:27:33 -04:00 · 79a278e922
commit 79a278e922
parent dd3e62a703 6eeaeba126
340 changed files with 43840 additions and 162220 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@ -25,7 +25,7 @@ ENV GGML_CUDA=1

 RUN make -j$(nproc) llama-cli

-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@ -14,10 +14,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 COPY --from=build /app/build/bin/llama-cli /llama-cli

--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget libgomp1
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential git
@ -11,7 +11,7 @@ COPY . .

 RUN make -j$(nproc) llama-cli

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1

 RUN make -j$(nproc) llama-server

-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@ -14,10 +14,11 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
+    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev curl
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1

 RUN make -j$(nproc) llama-server

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@ -10,7 +10,6 @@
            "llama-embedding"
            "llama-server"
            "llama-quantize"
-            "llama-train-text-from-scratch"
          ];
          mkApp = name: {
            type = "app";
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -17,19 +17,19 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  clblast,
+  curl,
+  shaderc,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
-    useOpenCL
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
  useRocm ? config.rocmSupport,
+  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@ -56,7 +56,6 @@ let
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];

@ -91,6 +90,22 @@ let
      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
+
+      # server bench
+      ps.matplotlib
+
+      # server tests
+      ps.openai
+      ps.behave
+      ps.prometheus-client
+
+      # for examples/pydantic-models-to-grammar-examples.py
+      ps.docstring-parser
+      ps.pydantic
+
+      # for scripts/compare-llama-bench.py
+      ps.gitpython
+      ps.tabulate
    ]
  );

@ -132,6 +147,7 @@ let
  vulkanBuildInputs = [
    vulkan-headers
    vulkan-loader
+    shaderc
  ];
 in

@ -198,19 +214,19 @@ effectiveStdenv.mkDerivation (
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs;
+      ++ optionals useVulkan vulkanBuildInputs
+      ++ optionals enableCurl [ curl ];

    cmakeFlags =
      [
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+        (cmakeBool "LLAMA_CURL" enableCurl)
        (cmakeBool "GGML_NATIVE" false)
        (cmakeBool "GGML_BLAS" useBlas)
-        (cmakeBool "GGML_CLBLAST" useOpenCL)
        (cmakeBool "GGML_CUDA" useCuda)
        (cmakeBool "GGML_HIPBLAS" useRocm)
        (cmakeBool "GGML_METAL" useMetalKit)
@ -254,7 +270,6 @@ effectiveStdenv.mkDerivation (
        useCuda
        useMetalKit
        useMpi
-        useOpenCL
        useRocm
        useVulkan
        ;
@ -281,7 +296,7 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+      badPlatforms = optionals useCuda lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -8,13 +8,11 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert-hf-to-gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -36,8 +34,6 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -9,5 +9,3 @@ contact_links:
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
-
-
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -16,7 +16,9 @@ SYCL:
        - any-glob-to-any-file:
            - ggml/include/ggml-sycl.h
            - ggml/src/ggml-sycl.cpp
-            - README-sycl.md
+            - ggml/src/ggml-sycl/**
+            - docs/backend/SYCL.md
+            - examples/sycl/**
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -47,7 +47,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@ -105,7 +105,7 @@ jobs:
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@ -222,7 +222,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build . --config Release -j $(nproc)

      - name: Test
@ -355,8 +355,10 @@ jobs:
      - name: Dependencies
        id: depends
        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libvulkan-dev
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential vulkan-sdk

      - name: Build
        id: cmake_build
@ -799,6 +801,7 @@ jobs:
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
          & $sde -future -- ctest -L main -C Release --verbose --timeout 900

      - name: Determine tag name
@ -857,6 +860,7 @@ jobs:
          mkdir build
          cd build
          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@ -0,0 +1,38 @@
+name: Python Type-Check
+
+on:
+  push:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - '**.py'
+      - '**/requirements*.txt'
+  pull_request:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - '**.py'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  python-type-check:
+    runs-on: ubuntu-latest
+    name: pyright type-check
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Python dependencies
+        # TODO: use a venv
+        run: pip install -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.370
+          level: warning
+          warnings: true
--- a/.gitignore
+++ b/.gitignore
@ -47,8 +47,10 @@ build*
 !build-info.cpp.in
 !build-info.sh
 !build.zig
+!docs/build.md
 /libllama.so
 /llama-*
+/vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
 cmake-build-*
@ -60,6 +62,11 @@ llama-batched-swift
 out/
 tmp/

+# Deprecated
+
+/main
+/server
+
 # CI

 !.github/workflows/*.yml
@ -98,13 +105,14 @@ examples/server/*.mjs.hpp

 # Python

-__pycache__
-.venv
-/Pipfile
-dist
-poetry.lock
+/.venv
+__pycache__/
+*/poetry.lock
 poetry.toml

+# Nix
+/result
+
 # Test binaries
 /tests/test-backend-ops
 /tests/test-double-float
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -42,13 +42,14 @@ endif()

 option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
 #
 # option list
 #

-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@ -73,20 +74,26 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

 # override ggml options
-set(GGML_CCACHE             ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
-set(GGML_LLAMAFILE          ON)
-set(GGML_CUDA_USE_GRAPHS    ON)
+
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
+    set(GGML_CUDA_USE_GRAPHS ON)
+endif()

 # transition helpers
 function (llama_option_depr TYPE OLD NEW)
    if (${OLD})
        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
    endif()
 endfunction()

@ -96,16 +103,19 @@ llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
 llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
 llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
-llama_option_depr(WARNING     LLAMA_OPENMP              GGML_OPENMP)
 llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
+llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)

 #
 # build the library
 #

-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)

 #
@ -123,7 +133,16 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

-get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
+
+# At the moment some compile definitions are placed within the ggml/src
+# directory but not exported on the `ggml` target. This could be improved by
+# determining _precisely_ which defines are necessary for the llama-config
+# package.
+#
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
+get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
+set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
@ -146,7 +165,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

 install(
-    FILES convert-hf-to-gguf.py
+    FILES convert_hf_to_gguf.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -19,6 +19,7 @@
        "cacheVariables": {
            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
            "CMAKE_CXX_COMPILER": "icx",
+            "CMAKE_C_COMPILER": "cl",
            "GGML_SYCL": "ON",
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,14 +1,28 @@
-# Contributing Guidelines
+# Pull requests (for contributors)

-## Checklist
+- Test your changes:
+  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+  - Execute [the full CI locally on your machine](ci/README.md) before publishing
+- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments

-* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
-* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
-* Execute [the full CI locally on your machine](ci/README.md) before publishing
+# Pull requests (for collaborators)

-## PR formatting
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+
+# Coding guidelines
+
+- Avoid adding third-party dependencies, extra files, extra headers, etc.
+- Always consider cross-compatibility with other operating systems and architectures
+- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
+- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)

-* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-    - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
-* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
-* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
--- a/272
+++ b/272
@ -11,9 +11,9 @@ BUILD_TARGETS = \
 	llama-embedding \
 	llama-eval-callback \
 	llama-export-lora \
-	llama-finetune \
 	llama-gbnf-validator \
 	llama-gguf \
+	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
 	llama-imatrix \
@ -36,7 +36,6 @@ BUILD_TARGETS = \
 	llama-simple \
 	llama-speculative \
 	llama-tokenize \
-	llama-train-text-from-scratch \
 	llama-vdot \
 	llama-cvector-generator \
 	tests/test-c.o
@ -45,6 +44,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-autorelease \
 	tests/test-backend-ops \
+	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grad0 \
 	tests/test-grammar-integration \
@ -61,6 +61,15 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm

+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
+
+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+#  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@ -186,7 +195,11 @@ ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif

-default: $(BUILD_TARGETS)
+ifdef GGML_VULKAN
+	BUILD_TARGETS += vulkan-shaders-gen
+endif
+
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)

 test: $(TEST_TARGETS)
 	@failures=0; \
@ -221,7 +234,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'

-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)

 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
@ -238,17 +251,22 @@ MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11

-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE

 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@ -307,9 +325,9 @@ ifdef LLAMA_DEBUG
 	endif
 else
 	MK_CPPFLAGS   += -DNDEBUG
-	MK_CFLAGS     += -O3
-	MK_CXXFLAGS   += -O3
-	MK_NVCCFLAGS  += -O3
+	MK_CFLAGS     += -O3 -g
+	MK_CXXFLAGS   += -O3 -g
+	MK_NVCCFLAGS  += -O3 -g
 endif

 ifdef LLAMA_SANITIZE_THREAD
@ -510,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
 	endif
 endif # GGML_NO_ACCELERATE

+ifdef GGML_MUSA
+	CC := clang
+	CXX := clang++
+	GGML_CUDA := 1
+	MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
 ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
+	ifdef GGML_MUSA
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+	endif # GGML_MUSA
 endif # GGML_NO_OPENMP

 ifdef GGML_OPENBLAS
@ -531,14 +560,20 @@ ifdef GGML_OPENBLAS64
 endif # GGML_OPENBLAS64

 ifdef GGML_BLIS
-	MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
+	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
 	OBJ_GGML    += ggml/src/ggml-blas.o
 endif # GGML_BLIS

+ifdef GGML_NVPL
+	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+	MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+	OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_NVPL
+
 ifndef GGML_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML    += ggml/src/sgemm.o
+	OBJ_GGML    += ggml/src/llamafile/sgemm.o
 endif

 ifdef GGML_RPC
@ -558,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA
-	ifneq ('', '$(wildcard /opt/cuda)')
-		CUDA_PATH ?= /opt/cuda
-	else
-		CUDA_PATH ?= /usr/local/cuda
-	endif
+	ifdef GGML_MUSA
+		ifneq ('', '$(wildcard /opt/musa)')
+			CUDA_PATH ?= /opt/musa
+		else
+			CUDA_PATH ?= /usr/local/musa
+		endif

-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-	MK_NVCCFLAGS += -use_fast_math
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+	else
+		ifneq ('', '$(wildcard /opt/cuda)')
+			CUDA_PATH ?= /opt/cuda
+		else
+			CUDA_PATH ?= /usr/local/cuda
+		endif
+
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+		MK_NVCCFLAGS += -use_fast_math
+	endif # GGML_MUSA

 	OBJ_GGML += ggml/src/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@ -576,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS

+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA

 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
@ -591,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-	NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+	ifdef GGML_MUSA
+		NVCC = $(CCACHE) mcc
+	else
+		NVCC = $(CCACHE) nvcc
+	endif # GGML_MUSA
+endif # GGML_CUDA_NVCC

 ifdef CUDA_DOCKER_ARCH
 	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@ -663,9 +716,15 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+	ifdef GGML_MUSA
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+	else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+	endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT

 ggml/src/ggml-cuda/%.o: \
@ -688,8 +747,8 @@ endif # GGML_CUDA

 ifdef GGML_VULKAN
 	MK_CPPFLAGS += -DGGML_USE_VULKAN
-	MK_LDFLAGS  += -lvulkan
-	OBJ_GGML    += ggml/src/ggml-vulkan.o
+	MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
+	OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o

 ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
@ -711,10 +770,28 @@ ifdef GGML_VULKAN_RUN_TESTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_RUN_TESTS
 endif

-ggml/src/ggml-vulkan.o: \
-	ggml/src/ggml-vulkan.cpp \
-	ggml/include/ggml-vulkan.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+GLSLC_CMD  = glslc
+_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
+_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
+_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
+
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
+
+$(_ggml_vk_header): $(_ggml_vk_source)
+
+$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
+	$(_ggml_vk_genshaders_cmd) \
+		--glslc      $(GLSLC_CMD) \
+		--input-dir  $(_ggml_vk_input_dir) \
+		--target-hpp $(_ggml_vk_header) \
+		--target-cpp $(_ggml_vk_source)
+
+vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+
 endif # GGML_VULKAN

 ifdef GGML_HIPBLAS
@ -751,6 +828,14 @@ ifdef GGML_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # GGML_CUDA_FORCE_DMMV

+ifdef GGML_CUDA_FORCE_MMQ
+	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
 ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
@ -819,10 +904,14 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o

 OBJ_LLAMA = \
 	src/llama.o \
+	src/llama-vocab.o \
+	src/llama-grammar.o \
+	src/llama-sampling.o \
 	src/unicode.o \
 	src/unicode-data.o

@ -890,6 +979,7 @@ $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)

 ifndef CUDA_DOCKER_ARCH
@ -899,6 +989,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH

 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )

@ -919,6 +1010,7 @@ $(info   - LLAMA_NO_LLAMAFILE)
 $(info   - LLAMA_NO_ACCELERATE)
 $(info   - LLAMA_NO_OPENMP)
 $(info   - LLAMA_NO_METAL)
+$(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

@ -952,15 +1044,22 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@

+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS)    -c $< -o $@
+
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 ifndef GGML_NO_LLAMAFILE
-ggml/src/sgemm.o: \
-	ggml/src/sgemm.cpp \
-	ggml/src/sgemm.h \
+ggml/src/llamafile/sgemm.o: \
+	ggml/src/llamafile/sgemm.cpp \
+	ggml/src/llamafile/sgemm.h \
 	ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
@ -994,6 +1093,10 @@ src/unicode-data.o: \

 src/llama.o: \
 	src/llama.cpp \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-grammar.h \
+	src/llama-sampling.h \
 	src/unicode.h \
 	include/llama.h \
 	ggml/include/ggml-cuda.h \
@ -1003,6 +1106,29 @@ src/llama.o: \
 	ggml/include/ggml-backend.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+src/llama-vocab.o: \
+	src/llama-vocab.cpp \
+	src/llama-vocab.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-grammar.o: \
+	src/llama-grammar.cpp \
+	src/llama-grammar.h \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-sampling.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-sampling.o: \
+	src/llama-sampling.cpp \
+	src/llama-sampling.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 $(LIB_LLAMA): \
 	$(OBJ_LLAMA) \
 	$(LIB_GGML)
@ -1070,6 +1196,7 @@ clean:
 	rm -rvf src/*.o
 	rm -rvf tests/*.o
 	rm -rvf examples/*.o
+	rm -rvf common/*.o
 	rm -rvf *.a
 	rm -rvf *.dll
 	rm -rvf *.so
@ -1084,6 +1211,8 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
+	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
+	rm -rvf $(LEGACY_TARGETS_CLEAN)
 	find examples pocs -type f -name "*.o" -delete

 #
@ -1170,6 +1299,23 @@ llama-gguf: examples/gguf/gguf.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+examples/gguf-hash/deps/sha1/sha1.o: \
+	examples/gguf-hash/deps/sha1/sha1.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/xxhash/xxhash.o: \
+	examples/gguf-hash/deps/xxhash/xxhash.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/sha256/sha256.o: \
+	examples/gguf-hash/deps/sha256/sha256.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-gguf-split: examples/gguf-split/gguf-split.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1185,11 +1331,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
 	$(OBJ_GGML) $(OBJ_LLAMA)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1205,13 +1346,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-finetune: examples/finetune/finetune.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
-	$(OBJ_GGML) common/log.h
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@ -1358,7 +1494,7 @@ run-benchmark-matmult: llama-benchmark-matmult
 .PHONY: run-benchmark-matmult swift

 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-	$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@ -1462,3 +1598,51 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+#  Eventually we will want to remove these target from building all the time.
+main: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "  Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "  Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "  Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
--- a/Package.swift
+++ b/Package.swift
@ -4,12 +4,16 @@ import PackageDescription

 var sources = [
    "src/llama.cpp",
+    "src/llama-vocab.cpp",
+    "src/llama-grammar.cpp",
+    "src/llama-sampling.cpp",
    "src/unicode.cpp",
    "src/unicode-data.cpp",
    "ggml/src/ggml.c",
    "ggml/src/ggml-alloc.c",
    "ggml/src/ggml-backend.c",
    "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
 ]

 var resources: [Resource] = []
--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -13,7 +13,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 > [!IMPORTANT]
 [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

-### Recent API changes
+## Recent API changes

 - [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
@ -24,9 +24,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

-### Hot topics
+## Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
 - Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
@ -39,37 +39,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ----

-<details>
-  <summary>Table of Contents</summary>
-  <ol>
-    <li>
-      <a href="#description">Description</a>
-    </li>
-    <li>
-      <a href="#usage">Usage</a>
-      <ul>
-        <li><a href="#get-the-code">Get the Code</a></li>
-        <li><a href="#build">Build</a></li>
-        <li><a href="#blas-build">BLAS Build</a></li>
-        <li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
-        <li><a href="#run-the-quantized-model">Run the quantized model</a></li>
-        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
-        <li><a href="#quantization">Quantization</a></li>
-        <li><a href="#interactive-mode">Interactive mode</a></li>
-        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-        <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
-        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
-        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
-        <li><a href="#android">Android</a></li>
-        <li><a href="#docker">Docker</a></li>
-      </ul>
-    </li>
-    <li><a href="#contributing">Contributing</a></li>
-    <li><a href="#coding-guidelines">Coding guidelines</a></li>
-    <li><a href="#docs">Docs</a></li>
-  </ol>
-</details>
-
 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@ -87,14 +56,6 @@ Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomm
 improved significantly thanks to many contributions. It is the main playground for developing new features for the
 [ggml](https://github.com/ggerganov/ggml) library.

-**Supported platforms:**
-
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
- [X] FreeBSD
-
 **Supported models:**

 Typically finetunes of the base models below are supported as well.
@ -108,6 +69,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@ -134,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)

-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

 **Multimodal models:**

@ -149,12 +112,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

-**HTTP server**
-
-[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
-
 **Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@ -175,11 +132,13 @@ Typically finetunes of the base models below are supported as well.
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)

 **UI:**

 Unless otherwise noted these projects are open-source with permissive licensing:

+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
@ -217,10 +176,19 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption

---
+**Infrastructure:**

-Here is a typical run using LLaMA v2 13B on M2 Ultra:
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+
+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
+## Demo
+
+<details>
+<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>

 ```
 $ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
@ -300,454 +268,85 @@ llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms
 llama_print_timings:       total time = 25431.49 ms
 ```

+</details>
+
+<details>
+<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
+
 And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:

 https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4

+</details>
+
 ## Usage

 Here are the end-to-end binary build and model conversion steps for most supported models.

-### Get the Code
+### Basic usage
+
+Firstly, you need to get the binary. There are different methods that you can follow:
+- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
+- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
+- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
+
+You can run a basic completion using this command:

 ```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
+llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+
+# Output:
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
 ```

-### Build
+See [this page](./examples/main/README.md) for a full list of parameters.

-In order to build llama.cpp you have four different options.
+### Conversation mode

- Using `make`:
-  - On Linux or MacOS:
-
-      ```bash
-      make
-      ```
-
-  - On Windows:
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
-
-  - Notes:
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
- Using `CMake`:
-
-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
-
-  **Notes**:
-
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
-
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
-
-      ```bash
-      cmake -B build -DCMAKE_BUILD_TYPE=Debug
-      cmake --build build
-      ```
-
-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
-
-      ```bash
-      cmake -B build -G "Xcode"
-      cmake --build build --config Debug
-      ```
-
-   Using `gmake` (FreeBSD):
-
-    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-    2. Add your user to **video** group
-    3. Install compilation dependencies.
-
-        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
-        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-        ```
-
-### Homebrew
-
-On Mac and Linux, the homebrew package manager can be used via
-```
-brew install llama.cpp
-```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
-
-### Nix
-
-On Mac and Linux, the Nix package manager can be used via
-```
-nix profile install nixpkgs#llama-cpp
-```
-For flake enabled installs.
-
-Or
-```
-nix-env --file '<nixpkgs>' --install --attr llama-cpp
-```
-For non-flake enabled installs.
-
-This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-#### Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-```
-flox install llama-cpp
-```
-Flox follows the nixpkgs build of llama.cpp.
-
-### Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
-
-### BLAS Build
-
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
-
- #### Accelerate Framework:
-
-  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-
- #### OpenBLAS:
-
-  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
-
-  - Using `make`:
-    - On Linux:
-      ```bash
-      make GGML_OPENBLAS=1
-      ```
-
-    - On Windows:
-
-      1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-      2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-      3. Extract `w64devkit` on your pc.
-      4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-      5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-      6. Run `w64devkit.exe`.
-      7. Use the `cd` command to reach the `llama.cpp` folder.
-      8. From here you can run:
-
-          ```bash
-          make GGML_OPENBLAS=1
-          ```
-
-  - Using `CMake` on Linux:
-
-      ```bash
-      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-      cmake --build build --config Release
-      ```
-
- #### BLIS
-
-  Check [BLIS.md](docs/BLIS.md) for more information.
-
- #### SYCL
-  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-  llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-  For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
-
- #### Intel oneMKL
-  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
-
-  - Using manual oneAPI installation:
-    By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
-      ```bash
-      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
-      cmake --build build --config Release
-      ```
-
-  - Using oneAPI docker image:
-    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-
-  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-
- #### CUDA
-
-  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
-
-  - Using `make`:
-    ```bash
-    make GGML_CUDA=1
-    ```
-  - Using `CMake`:
-
-    ```bash
-    cmake -B build -DGGML_CUDA=ON
-    cmake --build build --config Release
-    ```
-
-  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-
-  | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-  | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-  | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-  | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-  | GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-  | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
-
- #### hipBLAS
-
-  This provides BLAS acceleration on HIP-supported AMD GPUs.
-  Make sure to have ROCm installed.
-  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
-
-  - Using `make`:
-    ```bash
-    make GGML_HIPBLAS=1
-    ```
-  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
-    ```bash
-    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build --config Release -- -j 16
-    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-    Note that if you get the following error:
-    ```
-    clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-    ```
-    Try searching for a directory under `HIP_PATH` that contains the file
-    `oclc_abi_version_400.bc`. Then, add the following to the start of the
-    command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
-    like:
-    ```bash
-    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
-    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
-    ```
-
-  - Using `make` (example for target gfx1030, build with 16 CPU threads):
-    ```bash
-    make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
-    ```
-
-  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
-    ```bash
-    set PATH=%HIP_PATH%\bin;%PATH%
-    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
-    cmake --build build
-    ```
-    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
-    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-
-
-  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-  | Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-  |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-  | GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
-
- #### Vulkan
-
-  **With docker**:
-
-  You don't need to install Vulkan SDK. It will be installed inside the container.
-
-  ```sh
-  # Build the image
-  docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
-
-  # Then, use it:
-  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-  ```
-
-  **Without docker**:
-
-  Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
-
-  For example, on Ubuntu 22.04 (jammy), use the command below:
-
-  ```bash
-  wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
-  wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-  apt update -y
-  apt-get install -y vulkan-sdk
-  # To verify the installation, use the command below:
-  vulkaninfo
-  ```
-
-  Alternatively your package manager might be able to provide the appropriate libraries.
-  For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-  For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
-
-  Then, build llama.cpp using the cmake command below:
-
-  ```bash
-  cmake -B build -DGGML_VULKAN=1
-  cmake --build build --config Release
-  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
-  ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
-
-  # You should see in the output, ggml_vulkan detected your GPU. For example:
-  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
-  ```
-
-### Prepare and Quantize
-
-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
-
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-
-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:

 ```bash
-# obtain the official LLaMA model weights and place them in ./models
-ls ./models
-llama-2-7b tokenizer_checklist.chk tokenizer.model
-# [Optional] for models using BPE tokenizers
-ls ./models
-<folder containing weights and tokenizer json> vocab.json
-# [Optional] for PyTorch .bin models like Mistral-7B
-ls ./models
-<folder containing weights and tokenizer json>
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv

-# install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# convert the model to ggml FP16 format
-python3 convert-hf-to-gguf.py models/mymodel/
-
-# quantize the model to 4-bits (using Q4_K_M method)
-./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
-
-# update the gguf filetype to current version if older version is now unsupported
-./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+# Output:
+# > hi, who are you?
+# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+#
+# > what is 1+1?
+# Easy peasy! The answer to 1+1 is... 2!
 ```

-### Run the quantized model
+By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

 ```bash
-# start inference on a gguf model
-./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
 ```

-When running the larger models, make sure you have enough disk space to store all the intermediate files.
+You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:

-### Running on Windows with prebuilt binaries
-
-You will find prebuilt Windows binaries on the release page.
-
-Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
-
-From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
-
-```
-.\main -m llama-2-7b.Q4_0.gguf -n 128
+```bash
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
 ```

-### Memory/Disk Requirements
+### Web server

-As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

-| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|----------------------:|
-|    7B |         13 GB |                3.9 GB |
-|   13B |         24 GB |                7.8 GB |
-|   30B |         60 GB |               19.5 GB |
-|   65B |        120 GB |               38.5 GB |
+Example usage:

-### Quantization
+```bash
+./llama-server -m your_model.gguf --port 8080

-Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
-
-*(outdated)*
-
-| Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
-|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
-|    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
-|    7B | ms/tok @ 4th |    127 |     55 |     54 |     76 |     83 |     72 |
-|    7B | ms/tok @ 8th |    122 |     43 |     45 |     52 |     56 |     67 |
-|    7B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
-|   13B | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
-|   13B | file size    |  25.0G |   6.8G |   7.6G |   8.3G |   9.1G |    13G |
-|   13B | ms/tok @ 4th |      - |    103 |    105 |    148 |    160 |    131 |
-|   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
-|   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
-
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements and new i-quants
-  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
-  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
-  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
-  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
-  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
-  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
-  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
-  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
-  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
-  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
-  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
-  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
-  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
-
-### Perplexity (measuring model quality)
-
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
-
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
-
-#### How to run
-
-1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
-3. Output:
+# Basic web UI can be accessed via browser: http://localhost:8080
+# Chat completion endpoint: http://localhost:8080/v1/chat/completions
 ```
-perplexity : calculating perplexity over 655 chunks
-24.43 seconds per pass - ETA 4.45 hours
-[1]4.5970,[2]5.1807,[3]6.0382,...
-```
-And after 4.45 hours, you will have the final perplexity.

 ### Interactive mode

-If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
+> [!NOTE]
+> If you prefer basic usage, please consider using conversation mode instead of interactive mode
+
 In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

 Here is an example of a few-shot interaction, invoked with the command
@ -798,18 +397,71 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,

 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

-### Obtaining and using the Facebook LLaMA 2 model
+## Build

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
-  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
-  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
-  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
-  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
-  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
-  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
+Please refer to [Build llama.cpp locally](./docs/build.md)

-### Seminal papers and background on the models
+## Supported backends
+
+| Backend | Target devices |
+| --- | --- |
+| [Metal](./docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](./docs/build.md#blas-build) | All |
+| [BLIS](./docs/backend/BLIS.md) | All |
+| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
+| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
+| [Vulkan](./docs/build.md#vulkan) | GPU |
+
+## Tools
+
+### Prepare and Quantize
+
+> [!NOTE]
+> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+
+To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+
+Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+
+To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+
+### Perplexity (measuring model quality)
+
+You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+
+To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
+
+## Contributing
+
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
+- Collaborators will be invited based on contributions
+- Any help with managing issues and PRs is very appreciated!
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
+- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
+
+## Other documentations
+
+- [main (cli)](./examples/main/README.md)
+- [server](./examples/server/README.md)
+- [jeopardy](./examples/jeopardy/README.md)
+- [GBNF grammars](./grammars/README.md)
+
+**Development documentations**
+
+- [How to build](./docs/build.md)
+- [Running on Docker](./docs/docker.md)
+- [Build on Android](./docs/android.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+
+**Seminal papers and background on the models**

 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:
@ -820,178 +472,3 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-
-### Android
-
-#### Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
-#### Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
-
-```
-$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
-```
-
-Now, you can start chatting:
-```
-$cd /data/data/com.termux/files/home/bin
-$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
-```
-
-Here's a demo of an interactive session running on Pixel 5 phone:
-
-https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
-
-### Docker
-
-#### Prerequisites
-* Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (ex. /llama/models)
-
-#### Images
-We have three Docker images available for this project:
-
-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
-
-Additionally, there the following images, similar to the above:
-
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
-
-#### Usage
-
-The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
-
-Replace `/path/to/models` below with the actual path where you downloaded the models.
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
-```
-
-On completion, you are ready to play!
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a light image:
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a server image:
-
-```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
-```
-
-### Docker With CUDA
-
-Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
-
-#### Building Locally
-
-```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
-```
-
-You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
-
-The defaults are:
-
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
-
-The resulting images, are essentially the same as the non-CUDA images:
-
-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
-
-#### Usage
-
-After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
-
-```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
-```
-
-### Contributing
-
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
-
-### Coding guidelines
-
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
-
-![matmul](media/matmul.png)
-
-### Docs
-
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [GBNF grammars](./grammars/README.md)
--- a/ci/run.sh
+++ b/ci/run.sh
@ -103,6 +103,9 @@ function gg_run_ctest_debug {

    set -e

+    # Check cmake, make and ctest are installed
+    gg_check_build_requirements
+
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log

@ -131,6 +134,9 @@ function gg_run_ctest_release {

    set -e

+    # Check cmake, make and ctest are installed
+    gg_check_build_requirements
+
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

@ -287,7 +293,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -421,7 +427,7 @@ function gg_run_pythia_1_4b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -553,7 +559,7 @@ function gg_run_pythia_2_8b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -688,7 +694,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -701,6 +707,20 @@ function gg_run_embd_bge_small {
    set +e
 }

+function gg_check_build_requirements {
+    if ! command -v cmake &> /dev/null; then
+        gg_printf 'cmake not found, please install'
+    fi
+
+    if ! command -v make &> /dev/null; then
+        gg_printf 'make not found, please install'
+    fi
+
+    if ! command -v ctest &> /dev/null; then
+        gg_printf 'ctest not found, please install'
+    fi
+}
+
 function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@ -8,6 +8,13 @@ set(GGML_CUDA       @GGML_CUDA@)
 set(GGML_METAL      @GGML_METAL@)
 set(GGML_HIPBLAS    @GGML_HIPBLAS@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
+set(GGML_VULKAN @GGML_VULKAN@)
+set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
+set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
+set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
+set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
+set(GGML_SYCL @GGML_SYCL@)
+set(GGML_OPENMP @GGML_OPENMP@)

@PACKAGE_INIT@

@ -37,18 +44,36 @@ if (GGML_METAL)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()

+if (GGML_VULKAN)
+    find_package(Vulkan REQUIRED)
+endif()
+
 if (GGML_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
 endif()

+if (GGML_SYCL)
+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP REQUIRED)
+endif()
+
+
+find_library(ggml_LIBRARY ggml
+    REQUIRED
+    HINTS ${LLAMA_LIB_DIR})
+
 find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR})

-set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
-set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
+set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
+set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")

 add_library(llama UNKNOWN IMPORTED)

--- a/common/common.cpp
+++ b/common/common.cpp
@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "common.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //

+void gpt_params_handle_hf_token(gpt_params & params) {
+    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+        params.hf_token = std::getenv("HF_TOKEN");
+    }
+}
+
 void gpt_params_handle_model_default(gpt_params & params) {
    if (!params.hf_repo.empty()) {
        // short-hand to avoid specifying --hf-file -> default it to --model
@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {

    gpt_params_handle_model_default(params);

+    gpt_params_handle_hf_token(params);
+
    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
@ -472,6 +484,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
+    if (arg == "--attention") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+        else { invalid_param = true; }
+        return true;
+    }
    if (arg == "--defrag-thold" || arg == "-dt") {
        CHECK_ARG
        params.defrag_thold = std::stof(argv[i]);
@ -644,6 +664,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.model_url = argv[i];
        return true;
    }
+    if (arg == "-hft" || arg == "--hf-token") {
+        if (++i >= argc) {
+          invalid_param = true;
+          return true;
+        }
+        params.hf_token = argv[i];
+        return true;
+    }
    if (arg == "-hfr" || arg == "--hf-repo") {
        CHECK_ARG
        params.hf_repo = argv[i];
@ -657,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    if (arg == "--lora") {
        CHECK_ARG
        params.lora_adapter.emplace_back(argv[i], 1.0f);
-        params.use_mmap = false;
        return true;
    }
    if (arg == "--lora-scaled") {
@ -665,12 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        const char* lora_adapter = argv[i];
        CHECK_ARG
        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-        params.use_mmap = false;
-        return true;
-    }
-    if (arg == "--lora-base") {
-        CHECK_ARG
-        params.lora_base = argv[i];
        return true;
    }
    if (arg == "--control-vector") {
@ -757,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cache_type_v = argv[++i];
        return true;
    }
-    if (arg == "--multiline-input") {
+    if (arg == "-mli" || arg == "--multiline-input") {
        params.multiline_input = true;
        return true;
    }
@ -769,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cont_batching = true;
        return true;
    }
+    if (arg == "-nocb" || arg == "--no-cont-batching") {
+        params.cont_batching = false;
+        return true;
+    }
    if (arg == "-fa" || arg == "--flash-attn") {
        params.flash_attn = true;
        return true;
@ -1014,16 +1039,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--in-prefix-bos") {
        params.input_prefix_bos = true;
+        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--in-prefix") {
        CHECK_ARG
        params.input_prefix = argv[i];
+        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--in-suffix") {
        CHECK_ARG
        params.input_suffix = argv[i];
+        params.enable_chat_template = false;
+        return true;
+    }
+    if (arg == "--spm-infill") {
+        params.spm_infill = true;
        return true;
    }
    if (arg == "--grammar") {
@ -1237,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        CHECK_ARG
        params.out_file = argv[i];
        params.cvector_outfile = argv[i];
+        params.lora_outfile = argv[i];
        return true;
    }
    if (arg == "-ofreq" || arg == "--output-frequency") {
@ -1291,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
    // Parse args for logging parameters
    if (log_param_single_parse(argv[i])) {
@ -1387,7 +1424,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       --keep N",               "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
    options.push_back({ "*",           "       --chunks N",             "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
    options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with\n"
+                                                                        "in conversation mode, this will be used as system prompt\n"
+                                                                        "(default: '%s')", params.prompt.c_str() });
    options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" });
    options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" });
    options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" });
@ -1402,13 +1441,18 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                        "halt generation at PROMPT, return control in interactive mode\n"
                                                                        "can be specified more than once for multiple prompts" });
    options.push_back({ "main",        "-sp,   --special",              "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode, does not print special tokens and suffix/prefix\n"
+                                                                        "if suffix/prefix are not specified, default chat template will be used\n"
+                                                                        "(default: %s)", params.conversation ? "true" : "false" });
    options.push_back({ "main infill", "-i,    --interactive",          "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
    options.push_back({ "main infill", "-if,   --interactive-first",    "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
    options.push_back({ "main infill", "-mli,  --multiline-input",      "allows you to write or paste multiple lines without ending each in '\\'" });
    options.push_back({ "main infill", "       --in-prefix-bos",        "prefix BOS to user inputs, preceding the `--in-prefix` string" });
    options.push_back({ "main infill", "       --in-prefix STRING",     "string to prefix user inputs with (default: empty)" });
    options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main",        "       --no-warmup",            "skip warming up the model with an empty run" });
+    options.push_back({ "server infill",
+                                       "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

    options.push_back({ "sampling" });
    options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
@ -1444,6 +1488,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                                                                        "if suffix/prefix are specified, template will be disabled\n"
                                                                        "only commonly used templates are accepted:\n"
                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
    options.push_back({ "grammar" });
@ -1454,8 +1499,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                        "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });

    options.push_back({ "embedding" });
-    options.push_back({ "embedding",   "       --pooling {none,mean,cls}",
+    options.push_back({ "embedding",   "       --pooling {none,mean,cls,last}",
                                                                        "pooling type for embeddings, use model default if unspecified" });
+    options.push_back({ "embedding",   "       --attention {causal,non-causal}",
+                                                                        "attention type for embeddings, use model default if unspecified" });

    options.push_back({ "context hacking" });
    options.push_back({ "*",           "       --rope-scaling {none,linear,yarn}",
@ -1494,6 +1541,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-np,   --parallel N",           "number of parallel sequences to decode (default: %d)", params.n_parallel });
    options.push_back({ "*",           "-ns,   --sequences N",          "number of sequences to decode (default: %d)", params.n_sequences });
    options.push_back({ "*",           "-cb,   --cont-batching",        "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+    options.push_back({ "*",           "-nocb, --no-cont-batching",     "disable continuous batching" });

    options.push_back({ "multi-modality" });
    options.push_back({ "*",           "       --mmproj FILE",          "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@ -1536,9 +1584,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       --override-kv KEY=TYPE:VALUE",
                                                                        "advanced option to override model metadata by key. may be specified multiple times.\n"
                                                                        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-base FNAME",      "optional model to use as a base for the layers modified by the LoRA adapter" });
+    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
    options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
                                                                        "note: this argument can be repeated to add multiple control vectors" });
    options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
@ -1552,6 +1599,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-mu,   --model-url MODEL_URL",  "model download url (default: unused)" });
    options.push_back({ "*",           "-hfr,  --hf-repo REPO",         "Hugging Face model repository (default: unused)" });
    options.push_back({ "*",           "-hff,  --hf-file FILE",         "Hugging Face model file (default: unused)" });
+    options.push_back({ "*",           "-hft,  --hf-token TOKEN",       "Hugging Face access token (default: value from HF_TOKEN environment variable)" });

    options.push_back({ "retrieval" });
    options.push_back({ "retrieval",   "       --context-file FNAME",   "file to load context from (repeat to specify multiple files)" });
@ -1628,6 +1676,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
    options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });

+    options.push_back({ "export-lora" });
+    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
+    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
+    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
+    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
+
    printf("usage: %s [options]\n", argv[0]);

    for (const auto & o : options) {
@ -1991,9 +2046,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    llama_model * model = nullptr;

    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
    } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
    } else {
        model = llama_load_model_from_file(params.model.c_str(), mparams);
    }
@ -2039,19 +2094,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
            llama_free(lctx);
            llama_free_model(model);
            return std::make_tuple(nullptr, nullptr);
        }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
    }

    if (params.ignore_eos) {
@ -2061,7 +2111,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
@ -2144,6 +2211,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
+    cparams.attention_type    = params.attention_type;
    cparams.defrag_thold      = params.defrag_thold;
    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
@ -2163,7 +2231,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
 }

-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

    // Initialize libcurl
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@ -2178,6 +2246,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);

+    // Check if hf-token or bearer-token was specified
+    if (!hf_token.empty()) {
+      std::string auth_header = "Authorization: Bearer ";
+      auth_header += hf_token.c_str();
+      struct curl_slist *http_headers = NULL;
+      http_headers = curl_slist_append(http_headers, auth_header.c_str());
+      curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+    }
+
 #if defined(_WIN32)
    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
    //   operating system. Currently implemented under MS-Windows.
@ -2373,6 +2450,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 struct llama_model * llama_load_model_from_url(
        const char * model_url,
        const char * path_model,
+        const char * hf_token,
        const struct llama_model_params & params) {
    // Basic validation of the model_url
    if (!model_url || strlen(model_url) == 0) {
@ -2380,7 +2458,7 @@ struct llama_model * llama_load_model_from_url(
        return NULL;
    }

-    if (!llama_download_file(model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model, hf_token)) {
        return NULL;
    }

@ -2428,14 +2506,14 @@ struct llama_model * llama_load_model_from_url(
        // Prepare download in parallel
        std::vector<std::future<bool>> futures_download;
        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
                char split_path[PATH_MAX] = {0};
                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);

                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

-                return llama_download_file(split_url, split_path);
+                return llama_download_file(split_url, split_path, hf_token);
            }, idx));
        }

@ -2454,6 +2532,7 @@ struct llama_model * llama_load_model_from_hf(
        const char * repo,
        const char * model,
        const char * path_model,
+        const char * hf_token,
        const struct llama_model_params & params) {
    // construct hugging face model url:
    //
@ -2469,7 +2548,7 @@ struct llama_model * llama_load_model_from_hf(
    model_url += "/resolve/main/";
    model_url += model;

-    return llama_load_model_from_url(model_url.c_str(), path_model, params);
+    return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }

 #else
@ -2477,6 +2556,7 @@ struct llama_model * llama_load_model_from_hf(
 struct llama_model * llama_load_model_from_url(
        const char * /*model_url*/,
        const char * /*path_model*/,
+        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
    return nullptr;
@ -2486,6 +2566,7 @@ struct llama_model * llama_load_model_from_hf(
        const char * /*repo*/,
        const char * /*model*/,
        const char * /*path_model*/,
+        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
    return nullptr;
@ -2550,51 +2631,35 @@ std::vector<llama_token> llama_tokenize(
 }

 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
    }

-    return std::string(result.data(), result.size());
+    return piece;
 }

-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
    }

-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
+    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }

 bool llama_should_add_bos_token(const llama_model * model) {
@ -2618,6 +2683,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
        const std::vector<llama_chat_msg> & msgs,
        bool add_ass) {
    int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
    std::vector<llama_chat_message> chat;
    for (auto & msg : msgs) {
        chat.push_back({msg.role.c_str(), msg.content.c_str()});
@ -2630,10 +2696,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    std::string formatted_chat(buf.data(), res);
@ -2645,12 +2727,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
        const std::vector<llama_chat_msg> & past_msg,
        const llama_chat_msg & new_msg,
        bool add_ass) {
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    std::ostringstream ss;
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
    std::vector<llama_chat_msg> chat_new(past_msg);
+    // if the past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format chat with new_msg
    chat_new.push_back(new_msg);
    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
-    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return formatted;
+    // get the diff part
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return ss.str();
 }

 std::string llama_chat_format_example(const struct llama_model * model,
@ -2804,125 +2893,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //

 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
    llama_control_vector_data result = { -1, {} };

-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
    }

+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
    }

-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);

-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
+        int layer_idx = -1;

-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
            }
        }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
+        }
+
    }

+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
    return result;
 }

@ -2933,16 +2984,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
        auto cur = llama_control_vector_load_one(info);

        if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
        }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
        }

        if (result.n_embd == -1) {
            result = std::move(cur);
        } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f);  // extend if necessary
            for (size_t i = 0; i < cur.data.size(); i++) {
                result.data[i] += cur.data[i];
            }
@ -2950,7 +3004,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
    }

    if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
    }

    return result;
@ -3118,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
        }
        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
    }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
--- a/common/common.h
+++ b/common/common.h
@ -99,6 +99,7 @@ struct gpt_params {
    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;
@ -107,6 +108,7 @@ struct gpt_params {
    std::string model_draft          = ""; // draft model for speculative decoding
    std::string model_alias          = "unknown"; // model alias
    std::string model_url            = ""; // model url to download
+    std::string hf_token             = ""; // HF token
    std::string hf_repo              = ""; // HF repo
    std::string hf_file              = ""; // HF file
    std::string prompt               = "";
@ -126,7 +128,6 @@ struct gpt_params {

    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base  = "";                              // base model path for the lora adapter

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@ -200,6 +201,7 @@ struct gpt_params {
    std::string public_path   = "";
    std::string chat_template = "";
    std::string system_prompt = "";
+    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

@ -250,8 +252,13 @@ struct gpt_params {
    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

 bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
@ -307,8 +314,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // Batch utils

@ -346,21 +353,13 @@ std::string llama_token_to_piece(
                       llama_token   token,
                       bool          special = true);

-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-//       that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
+// optionally renders special/control tokens
+std::string llama_detokenize(
                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+                                  bool   special = true);

 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
@ -380,6 +379,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & chat,
@ -454,4 +455,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@ -720,7 +720,7 @@ private:
            }
            prop_names.push_back(prop_name);
        }
-        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule =
                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
--- a/common/log.h
+++ b/common/log.h
@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
    buf << "[ ";

    bool first = true;
-    for (const auto &token : tokens)
+    for (const auto & token : tokens)
    {
        if (!first) {
            buf << ", ";
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@ -37,11 +37,18 @@ struct llama_ngram {
    }
 };

+struct llama_token_hash_function {
+    size_t operator()(const llama_token token) const {
+        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+        return token * 11400714819323198485llu;
+    }
+};
+
 struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = 0;
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
        }
        return hash;
    }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
        GGML_ASSERT(!original_logits.empty());
    }
    llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);

    if (temp < 0.0) {
        // greedy sampling, with probs
@ -324,12 +322,15 @@ static llama_token llama_sampling_sample_impl(
    }

    if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
+
        // Create an array with a single token data element for the sampled id
        llama_token_data single_token_data = {id, logits[id], 0.0f};
        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

        // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);

        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
    if (ctx_sampling->grammar != NULL && !apply_grammar) {
        GGML_ASSERT(original_logits != NULL);
        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
    }

    // apply params.logit_bias map
@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
    }

-    cur.clear();
+    cur.resize(n_vocab);

    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
@ -420,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(

    // apply grammar checks before sampling logic
    if (apply_grammar && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
    }

    return cur_p;
@ -454,6 +455,6 @@ void llama_sampling_accept(
    ctx_sampling->prev.push_back(id);

    if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+        llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
    }
 }
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 # This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
 #
 # This is necessary in order to analyze the type of pre-tokenizer used by the model and
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
@ -15,9 +15,9 @@
 # - Add a new model to the "models" list
 # - Run the script with your huggingface token:
 #
-#   python3 convert-hf-to-gguf-update.py <huggingface_token>
+#   python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@ -37,7 +37,7 @@ from enum import IntEnum, auto
 from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert-hf-to-gguf-update")
+logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()


@ -45,20 +45,21 @@ class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
+    UGM = auto()


 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

 if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
        sys.exit(1)
 else:
-    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)

 # TODO: add models here, base models preferred
@ -85,6 +86,14 @@ models = [
    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
+    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
+    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
+    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
+    {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]


@ -93,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


@ -106,9 +115,13 @@ def download_model(model):
    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

+    if tokt == TOKENIZER_TYPE.UGM:
+        files.append("spiece.model")
+
    for file in files:
        save_path = f"models/tokenizers/{name}/{file}"
        if os.path.isfile(save_path):
@ -124,14 +137,14 @@ for model in models:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


-# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
 for model in models:
    name = model["name"]
    tokt = model["tokt"]

-    if tokt == TOKENIZER_TYPE.SPM:
+    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
@ -141,12 +154,15 @@ for model in models:

    # create the tokenizer
    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        if name == "t5":
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
@ -178,7 +194,7 @@ src_func = f"""
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-        chktxt = {repr(chktxt)}
+        chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -188,7 +204,7 @@ src_func = f"""

        res = None

-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
 {src_ifs}
@ -197,9 +213,9 @@ src_func = f"""
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
@ -213,7 +229,7 @@ src_func = f"""
        return res
 """

-convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
 convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
@ -224,7 +240,7 @@ convert_py = re.sub(

 convert_py_pth.write_text(convert_py, encoding="utf-8")

-logger.info("+++ convert-hf-to-gguf.py was updated")
+logger.info("+++ convert_hf_to_gguf.py was updated")

 # generate tests for each tokenizer model

@ -262,6 +278,7 @@ tests = [
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+    "!!!!!!",
    "3",
    "33",
    "333",
@ -271,8 +288,9 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    # "Cửa Việt", # llama-bpe fails on this
-    chktxt,
+    "Cửa Việt", # llama-bpe fails on this
+    " discards",
+    CHK_TXT,
 ]

 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
@ -299,7 +317,10 @@ for model in models:

    # create the tokenizer
    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        if name == "t5":
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop
@ -325,6 +346,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
 for model in models:
    name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

 logger.info("\n")
--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@ -132,6 +132,10 @@ class Tensor:


 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
        if self.vocab_override is not None:
            vo = self.vocab_override
            logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
@ -354,7 +358,8 @@ class GGMLToGGUF:


 def handle_metadata(cfg, hp):
-    import convert
+    import examples.convert_legacy_llama as convert
+
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import logging
+import argparse
+import os
+import sys
+import json
+from math import prod
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+# reuse model definitions from convert_hf_to_gguf.py
+from convert_hf_to_gguf import LazyTorchTensor, Model
+
+logger = logging.getLogger("lora-to-gguf")
+
+
+@dataclass
+class PartialLoraTensor:
+    A: Tensor | None = None
+    B: Tensor | None = None
+
+
+# magic to support tensor shape modifications and splitting
+class LoraTorchTensor:
+    _lora_A: Tensor  # (n_rank, row_size)
+    _lora_B: Tensor  # (col_size, n_rank)
+    _rank: int
+
+    def __init__(self, A: Tensor, B: Tensor):
+        assert len(A.shape) == len(B.shape)
+        assert A.shape[-2] == B.shape[-1]
+        if A.dtype != B.dtype:
+            A = A.to(torch.float32)
+            B = B.to(torch.float32)
+        self._lora_A = A
+        self._lora_B = B
+        self._rank = B.shape[-1]
+
+    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
+        return (self._lora_A, self._lora_B)
+
+    def __getitem__(
+        self,
+        indices: (
+            SupportsIndex
+            | slice
+            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
+        ),
+    ) -> LoraTorchTensor:
+        shape = self.shape
+        if isinstance(indices, SupportsIndex):
+            if len(shape) > 2:
+                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+            else:
+                raise NotImplementedError  # can't return a vector
+        elif isinstance(indices, slice):
+            if len(shape) > 2:
+                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+            else:
+                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
+        elif isinstance(indices, tuple):
+            assert len(indices) > 0
+            if indices[-1] is Ellipsis:
+                return self[indices[:-1]]
+            # expand ellipsis
+            indices = tuple(
+                u
+                for v in (
+                    (
+                        (slice(None, None) for _ in range(len(indices) - 1))
+                        if i is Ellipsis
+                        else (i,)
+                    )
+                    for i in indices
+                )
+                for u in v
+            )
+
+            if len(indices) < len(shape):
+                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
+
+            # TODO: make sure this is correct
+            indices_A = (
+                *(
+                    (
+                        j.__index__() % self._lora_A.shape[i]
+                        if isinstance(j, SupportsIndex)
+                        else slice(None, None)
+                    )
+                    for i, j in enumerate(indices[:-2])
+                ),
+                slice(None, None),
+                indices[-1],
+            )
+            indices_B = indices[:-1]
+            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
+        else:
+            raise NotImplementedError  # unknown indice type
+
+    @property
+    def dtype(self) -> torch.dtype:
+        assert self._lora_A.dtype == self._lora_B.dtype
+        return self._lora_A.dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        assert len(self._lora_A.shape) == len(self._lora_B.shape)
+        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
+
+    def size(self, dim=None):
+        assert dim is None
+        return self.shape
+
+    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
+        if isinstance(shape[0], tuple):
+            new_shape: tuple[int, ...] = shape[0]
+        else:
+            new_shape = cast(tuple[int, ...], shape)
+        orig_shape = self.shape
+        if len(new_shape) < 2:
+            raise NotImplementedError  # can't become a vector
+
+        # expand -1 in the shape
+        if any(dim == -1 for dim in new_shape):
+            n_elems = prod(orig_shape)
+            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
+            assert n_elems % n_new_elems == 0
+            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
+
+        if new_shape[-1] != orig_shape[-1]:
+            raise NotImplementedError  # can't reshape the row size trivially
+
+        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
+        shape_B = (*new_shape[:-1], self._rank)
+        return LoraTorchTensor(
+            self._lora_A.reshape(shape_A),
+            self._lora_B.reshape(shape_B),
+        )
+
+    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
+        return self.reshape(*other.shape)
+
+    def view(self, *size: int) -> LoraTorchTensor:
+        return self.reshape(*size)
+
+    def permute(self, *dims: int) -> LoraTorchTensor:
+        shape = self.shape
+        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
+        if dims[-1] == -1:
+            # TODO: support higher dimensional A shapes bigger than 1
+            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
+            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
+        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
+            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
+        else:
+            # TODO: compose the above two
+            raise NotImplementedError
+
+    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
+        shape = self.shape
+        dims = [i for i in range(len(shape))]
+        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
+        return self.permute(*dims)
+
+    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
+        return self.transpose(axis0, axis1)
+
+    def to(self, *args, **kwargs):
+        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
+
+    @classmethod
+    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
+        del types  # unused
+
+        if kwargs is None:
+            kwargs = {}
+
+        if func is torch.permute:
+            return type(args[0]).permute(*args, **kwargs)
+        elif func is torch.reshape:
+            return type(args[0]).reshape(*args, **kwargs)
+        elif func is torch.stack:
+            assert isinstance(args[0], Sequence)
+            dim = kwargs.get("dim", 0)
+            assert dim == 0
+            return LoraTorchTensor(
+                torch.stack([a._lora_A for a in args[0]], dim),
+                torch.stack([b._lora_B for b in args[0]], dim),
+            )
+        elif func is torch.cat:
+            assert isinstance(args[0], Sequence)
+            dim = kwargs.get("dim", 0)
+            assert dim == 0
+            if len(args[0][0].shape) > 2:
+                return LoraTorchTensor(
+                    torch.cat([a._lora_A for a in args[0]], dim),
+                    torch.cat([b._lora_B for b in args[0]], dim),
+                )
+            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
+                return LoraTorchTensor(
+                    args[0][0]._lora_A,
+                    torch.cat([b._lora_B for b in args[0]], dim),
+                )
+            else:
+                raise NotImplementedError
+        else:
+            raise NotImplementedError
+
+
+def get_base_tensor_name(lora_tensor_name: str) -> str:
+    base_name = lora_tensor_name.replace("base_model.model.", "")
+    base_name = base_name.replace(".lora_A.weight", ".weight")
+    base_name = base_name.replace(".lora_B.weight", ".weight")
+    return base_name
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+    )
+    parser.add_argument(
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+    )
+    parser.add_argument(
+        "--bigendian", action="store_true",
+        help="model is executed on big endian machine",
+    )
+    parser.add_argument(
+        "--no-lazy", action="store_true",
+        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="increase output verbosity",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out what will be done, without writing any new files",
+    )
+    parser.add_argument(
+        "--base", type=Path, required=True,
+        help="directory containing base model file",
+    )
+    parser.add_argument(
+        "lora_path", type=Path,
+        help="directory containing LoRA adapter file",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    ftype_map: dict[str, gguf.LlamaFileType] = {
+        "f32": gguf.LlamaFileType.ALL_F32,
+        "f16": gguf.LlamaFileType.MOSTLY_F16,
+        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "auto": gguf.LlamaFileType.GUESSED,
+    }
+
+    ftype = ftype_map[args.outtype]
+
+    dir_base_model: Path = args.base
+    dir_lora: Path = args.lora_path
+    lora_config = dir_lora / "adapter_config.json"
+    input_model = dir_lora / "adapter_model.safetensors"
+
+    if args.outfile is not None:
+        fname_out = args.outfile
+    else:
+        # output in the same directory as the model by default
+        fname_out = dir_lora
+
+    if os.path.exists(input_model):
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+
+        lora_model = load_file(input_model, device="cpu")
+    else:
+        input_model = os.path.join(dir_lora, "adapter_model.bin")
+        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
+
+    # load base model
+    logger.info(f"Loading base model: {dir_base_model.name}")
+    hparams = Model.load_hparams(dir_base_model)
+    with torch.inference_mode():
+        try:
+            model_class = Model.from_model_architecture(hparams["architectures"][0])
+        except NotImplementedError:
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            sys.exit(1)
+
+        class LoraModel(model_class):
+            model_arch = model_class.model_arch
+
+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
+            def set_type(self):
+                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
+                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+
+            def set_gguf_parameters(self):
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
+                super().set_gguf_parameters()
+
+            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+                tensor_map: dict[str, PartialLoraTensor] = {}
+
+                for name, tensor in lora_model.items():
+                    if self.lazy:
+                        tensor = LazyTorchTensor.from_eager(tensor)
+                    base_name = get_base_tensor_name(name)
+                    is_lora_a = ".lora_A.weight" in name
+                    is_lora_b = ".lora_B.weight" in name
+                    if not is_lora_a and not is_lora_b:
+                        if ".base_layer.weight" in name:
+                            continue
+                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        sys.exit(1)
+
+                    if base_name in tensor_map:
+                        if is_lora_a:
+                            tensor_map[base_name].A = tensor
+                        else:
+                            tensor_map[base_name].B = tensor
+                    else:
+                        if is_lora_a:
+                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
+                        else:
+                            tensor_map[base_name] = PartialLoraTensor(B=tensor)
+
+                for name, tensor in tensor_map.items():
+                    assert tensor.A is not None
+                    assert tensor.B is not None
+                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
+
+            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+                dest = super().modify_tensors(data_torch, name, bid)
+                for dest_name, dest_data in dest:
+                    assert isinstance(dest_data, LoraTorchTensor)
+                    lora_a, lora_b = dest_data.get_lora_A_B()
+
+                    yield (dest_name + ".lora_a", lora_a)
+                    yield (dest_name + ".lora_b", lora_b)
+
+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
+        model_instance = LoraModel(
+            dir_base_model,
+            ftype,
+            fname_out,
+            is_big_endian=args.bigendian,
+            use_temp_file=False,
+            eager=args.no_lazy,
+            dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
+        )
+
+        logger.info("Exporting model...")
+        model_instance.write()
+        logger.info(f"Model successfully exported to {model_instance.fname_out}")
--- a/docs/android.md
+++ b/docs/android.md
@ -0,0 +1,56 @@
+
+# Android
+
+## Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
+
+## Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+```
+$ mkdir build-android
+$ cd build-android
+$ export NDK=<your_ndk_directory>
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+$ make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+```
+
+Here's a demo of an interactive session running on Pixel 5 phone:
+
+https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/docs/backend/BLIS.md
+++ b/docs/backend/BLIS.md
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
+
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

-| Attribute              | Note                                                        |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

 4. Launch inference

 There are two device selection modes:

 - Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.

 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\ls-sycl-device.exe
 ```

-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|

 ```

-| Attribute              | Note                                                      |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended                      |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-

 4. Launch inference

 There are two device selection modes:

- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.

 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
--- a/docs/build.md
+++ b/docs/build.md
@ -0,0 +1,353 @@
+# Build llama.cpp locally
+
+**To get the Code:**
+
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+In order to build llama.cpp you have four different options.
+
+- Using `make`:
+  - On Linux or MacOS:
+
+      ```bash
+      make
+      ```
+
+  - On Windows (x86/x64 only, arm64 requires cmake):
+
+    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+    2. Extract `w64devkit` on your pc.
+    3. Run `w64devkit.exe`.
+    4. Use the `cd` command to reach the `llama.cpp` folder.
+    5. From here you can run:
+        ```bash
+        make
+        ```
+
+  - Notes:
+    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
+    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
+    - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For debug builds, run `make LLAMA_DEBUG=1`
+
+- Using `CMake`:
+
+  ```bash
+  cmake -B build
+  cmake --build build --config Release
+  ```
+
+  **Notes**:
+
+    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+    - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For debug builds, there are two cases:
+
+      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+
+      ```bash
+      cmake -B build -DCMAKE_BUILD_TYPE=Debug
+      cmake --build build
+      ```
+
+      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
+
+      ```bash
+      cmake -B build -G "Xcode"
+      cmake --build build --config Debug
+      ```
+    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+        - Tab Workload: Desktop-development with C++
+        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+      - For Windows on ARM (arm64, WoA) build with:
+        ```bash
+        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+        cmake --build build-arm64-windows-llvm-release
+        ```
+        Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+
+-   Using `gmake` (FreeBSD):
+
+    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+    2. Add your user to **video** group
+    3. Install compilation dependencies.
+
+        ```bash
+        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
+
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        ```
+
+## Metal Build
+
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
+argument.
+
+## BLAS Build
+
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
+
+### Accelerate Framework:
+
+This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
+
+### OpenBLAS:
+
+This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
+
+- Using `make`:
+  - On Linux:
+    ```bash
+    make GGML_OPENBLAS=1
+    ```
+
+  - On Windows:
+
+    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+    2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
+    3. Extract `w64devkit` on your pc.
+    4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
+    5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
+    6. Run `w64devkit.exe`.
+    7. Use the `cd` command to reach the `llama.cpp` folder.
+    8. From here you can run:
+
+        ```bash
+        make GGML_OPENBLAS=1
+        ```
+
+- Using `CMake` on Linux:
+
+    ```bash
+    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+    cmake --build build --config Release
+    ```
+
+### BLIS
+
+Check [BLIS.md](./backend/BLIS.md) for more information.
+
+### SYCL
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+### Intel oneMKL
+
+Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+- Using manual oneAPI installation:
+  By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    ```bash
+    source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
+    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
+    cmake --build build --config Release
+    ```
+
+- Using oneAPI docker image:
+  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
+
+Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
+
+### CUDA
+
+This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+
+For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
+
+- Using `make`:
+  ```bash
+  make GGML_CUDA=1
+  ```
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_CUDA=ON
+  cmake --build build --config Release
+  ```
+
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+
+| Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+| GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+| GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
+| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
+| GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+| GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
+| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
+
+### MUSA
+
+- Using `make`:
+  ```bash
+  make GGML_MUSA=1
+  ```
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
+
+### hipBLAS
+
+This provides BLAS acceleration on HIP-supported AMD GPUs.
+Make sure to have ROCm installed.
+You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
+
+- Using `make`:
+  ```bash
+  make GGML_HIPBLAS=1
+  ```
+- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
+  ```bash
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      && cmake --build build --config Release -- -j 16
+  ```
+  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
+  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
+  Note that if you get the following error:
+  ```
+  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+  ```
+  Try searching for a directory under `HIP_PATH` that contains the file
+  `oclc_abi_version_400.bc`. Then, add the following to the start of the
+  command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
+  like:
+  ```bash
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
+  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
+      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      && cmake --build build -- -j 16
+  ```
+
+- Using `make` (example for target gfx1030, build with 16 CPU threads):
+  ```bash
+  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+  ```
+
+- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
+  ```bash
+  set PATH=%HIP_PATH%\bin;%PATH%
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake --build build
+  ```
+  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+  Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
+
+
+The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
+The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+
+| Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
+|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
+
+### Vulkan
+
+**Windows**
+
+#### w64devkit
+
+Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+
+Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+
+Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
+```sh
+SDK_VERSION=1.3.283.0
+cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
+cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
+cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
+cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
+Name: Vulkan-Loader
+Description: Vulkan Loader
+Version: $SDK_VERSION
+Libs: -lvulkan-1
+EOF
+
+```
+Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
+
+#### MSYS2
+Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
+  ```sh
+  pacman -S git \
+      mingw-w64-ucrt-x86_64-gcc \
+      mingw-w64-ucrt-x86_64-cmake \
+      mingw-w64-ucrt-x86_64-vulkan-devel \
+      mingw-w64-ucrt-x86_64-shaderc
+  ```
+Switch into `llama.cpp` directory and build using CMake.
+```sh
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+**With docker**:
+
+You don't need to install Vulkan SDK. It will be installed inside the container.
+
+```sh
+# Build the image
+docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
+
+# Then, use it:
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
+**Without docker**:
+
+Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
+
+For example, on Ubuntu 22.04 (jammy), use the command below:
+
+```bash
+wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
+wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+apt update -y
+apt-get install -y vulkan-sdk
+# To verify the installation, use the command below:
+vulkaninfo
+```
+
+Alternatively your package manager might be able to provide the appropriate libraries.
+For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
+For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
+
+Then, build llama.cpp using the cmake command below:
+
+```bash
+cmake -B build -DGGML_VULKAN=1
+cmake --build build --config Release
+# Test the output binary (with "-ngl 33" to offload all layers to GPU)
+./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+
+# You should see in the output, ggml_vulkan detected your GPU. For example:
+# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
+```
+
+### Android
+
+To read documentation for how to build on Android, [click here](./android.md)
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@ -1,4 +1,4 @@
-## Add a new model architecture to `llama.cpp`
+# Add a new model architecture to `llama.cpp`

 Adding a model requires few steps:

@ -9,15 +9,15 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
+- [main](/examples/main/)
+- [imatrix](/examples/imatrix/)
+- [quantize](/examples/quantize/)
+- [server](/examples/server/)

 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

@ -31,7 +31,7 @@ class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
 ```

-2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)

 Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.

@ -54,7 +54,7 @@ Example for `falcon` model:

 As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.

-Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.

 If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.

@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

-Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).

 ## GGUF specification

--- a/docs/development/debugging-tests.md
+++ b/docs/development/debugging-tests.md
--- a/docs/development/llama-star/idea-arch.key
+++ b/docs/development/llama-star/idea-arch.key
--- a/docs/development/llama-star/idea-arch.pdf
+++ b/docs/development/llama-star/idea-arch.pdf
--- a/docs/development/token_generation_performance_tips.md
+++ b/docs/development/token_generation_performance_tips.md
@ -1,7 +1,7 @@
 # Token generation performance troubleshooting

 ## Verifying that the model is running on the GPU with CUDA
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
--- a/docs/docker.md
+++ b/docs/docker.md
@ -0,0 +1,86 @@
+# Docker
+
+## Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (ex. /llama/models)
+
+## Images
+We have three Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+
+Additionally, there the following images, similar to the above:
+
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+
+## Usage
+
+The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
+
+Replace `/path/to/models` below with the actual path where you downloaded the models.
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with a light image:
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with a server image:
+
+```bash
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+```
+
+## Docker With CUDA
+
+Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `CUDA_VERSION` set to `11.7.1`
+- `CUDA_DOCKER_ARCH` set to `all`
+
+The resulting images, are essentially the same as the non-CUDA images:
+
+1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
--- a/docs/install.md
+++ b/docs/install.md
@ -0,0 +1,39 @@
+# Install pre-built version of llama.cpp
+
+## Homebrew
+
+On Mac and Linux, the homebrew package manager can be used via
+
+```sh
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
+## Nix
+
+On Mac and Linux, the Nix package manager can be used via
+
+```sh
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+
+```sh
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+## Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+
+```sh
+flox install llama-cpp
+```
+
+Flox follows the nixpkgs build of llama.cpp.
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -21,8 +21,8 @@ else()
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
    add_subdirectory(export-lora)
-    add_subdirectory(finetune)
    add_subdirectory(gbnf-validator)
+    add_subdirectory(gguf-hash)
    add_subdirectory(gguf-split)
    add_subdirectory(gguf)
    add_subdirectory(gritlm)
@ -52,5 +52,4 @@ else()
    add_subdirectory(simple)
    add_subdirectory(speculative)
    add_subdirectory(tokenize)
-    add_subdirectory(train-text-from-scratch)
 endif()
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
            token,
            &result,
            Int32(result.count),
+            0,
            false
        )
        assert(check == actualTokensCount)
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
    int n_parallel = params.n_parallel;

    // total length of the sequences including the prompt
-    int n_predict = 32;
+    int n_predict = params.n_predict;

    // init LLM

@ -93,14 +93,34 @@ int main(int argc, char ** argv) {

    // create a llama_batch
    // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
+
+    std::vector<llama_seq_id> seq_ids(n_parallel, 0);
+    for (int32_t i = 0; i < n_parallel; ++i) {
+        seq_ids[i] = i;
+    }

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
    }
    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        llama_batch_clear(batch);
+        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+    }
+
    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    // assign the system KV cache to all parallel sequences
-    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    for (int32_t i = 1; i < n_parallel; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-    }
+    //// assign the system KV cache to all parallel sequences
+    //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+    //for (int32_t i = 1; i < n_parallel; ++i) {
+    //    llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+    //}

    if (n_parallel > 1) {
        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar

 import numpy as np

@ -346,42 +346,6 @@ class Params:
        return params


-@dataclass
-class Metadata:
-    name: Optional[str] = None
-    author: Optional[str] = None
-    version: Optional[str] = None
-    url: Optional[str] = None
-    description: Optional[str] = None
-    licence: Optional[str] = None
-    source_url: Optional[str] = None
-    source_hf_repo: Optional[str] = None
-
-    @staticmethod
-    def load(metadata_path: Path) -> Metadata:
-        if metadata_path is None or not metadata_path.exists():
-            return Metadata()
-
-        with open(metadata_path, 'r') as file:
-            data = json.load(file)
-
-        # Create a new Metadata instance
-        metadata = Metadata()
-
-        # Assigning values to Metadata attributes if they exist in the JSON file
-        # This is based on LLM_KV_NAMES mapping in llama.cpp
-        metadata.name = data.get("general.name")
-        metadata.author = data.get("general.author")
-        metadata.version = data.get("general.version")
-        metadata.url = data.get("general.url")
-        metadata.description = data.get("general.description")
-        metadata.license = data.get("general.license")
-        metadata.source_url = data.get("general.source.url")
-        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
-
-        return metadata
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
@ -492,12 +456,13 @@ class LazyTensor:

 LazyModel: TypeAlias = 'dict[str, LazyTensor]'

+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']

@dataclass
 class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors', 'none']
+    format: ModelFormat
    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.


@ -536,7 +501,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:


 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
+    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
    assert len(formats) == 1, "different formats?"
    format = formats.pop()
    paths = [path for mp in models_plus for path in mp.paths]
@ -555,7 +520,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])

-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)


 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -805,7 +770,7 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+    def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
        # Metadata About The Model And Its Provenence
        name = "LLaMA"
        if metadata is not None and metadata.name is not None:
@ -823,16 +788,73 @@ class OutputFile:
                self.gguf.add_author(metadata.author)
            if metadata.version is not None:
                self.gguf.add_version(metadata.version)
-            if metadata.url is not None:
-                self.gguf.add_url(metadata.url)
+            if metadata.organization is not None:
+                self.gguf.add_organization(metadata.organization)
+
+            if metadata.finetune is not None:
+                self.gguf.add_finetune(metadata.finetune)
+            if metadata.basename is not None:
+                self.gguf.add_basename(metadata.basename)
+
            if metadata.description is not None:
                self.gguf.add_description(metadata.description)
-            if metadata.licence is not None:
-                self.gguf.add_licence(metadata.licence)
+            if metadata.quantized_by is not None:
+                self.gguf.add_quantized_by(metadata.quantized_by)
+
+            if metadata.size_label is not None:
+                self.gguf.add_size_label(metadata.size_label)
+
+            if metadata.license is not None:
+                self.gguf.add_license(metadata.license)
+            if metadata.license_name is not None:
+                self.gguf.add_license_name(metadata.license_name)
+            if metadata.license_link is not None:
+                self.gguf.add_license_link(metadata.license_link)
+
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.doi is not None:
+                self.gguf.add_doi(metadata.doi)
+            if metadata.uuid is not None:
+                self.gguf.add_uuid(metadata.uuid)
+            if metadata.repo_url is not None:
+                self.gguf.add_repo_url(metadata.repo_url)
+
            if metadata.source_url is not None:
                self.gguf.add_source_url(metadata.source_url)
-            if metadata.source_hf_repo is not None:
-                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
+            if metadata.source_doi is not None:
+                self.gguf.add_source_doi(metadata.source_doi)
+            if metadata.source_uuid is not None:
+                self.gguf.add_source_uuid(metadata.source_uuid)
+            if metadata.source_repo_url is not None:
+                self.gguf.add_source_repo_url(metadata.source_repo_url)
+
+            if metadata.base_models is not None:
+                self.gguf.add_base_model_count(len(metadata.base_models))
+                for key, base_model_entry in enumerate(metadata.base_models):
+                    if "name" in base_model_entry:
+                        self.gguf.add_base_model_name(key, base_model_entry["name"])
+                    if "author" in base_model_entry:
+                        self.gguf.add_base_model_author(key, base_model_entry["author"])
+                    if "version" in base_model_entry:
+                        self.gguf.add_base_model_version(key, base_model_entry["version"])
+                    if "organization" in base_model_entry:
+                        self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                    if "url" in base_model_entry:
+                        self.gguf.add_base_model_url(key, base_model_entry["url"])
+                    if "doi" in base_model_entry:
+                        self.gguf.add_base_model_doi(key, base_model_entry["doi"])
+                    if "uuid" in base_model_entry:
+                        self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
+                    if "repo_url" in base_model_entry:
+                        self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
+
+            if metadata.tags is not None:
+                self.gguf.add_tags(metadata.tags)
+            if metadata.languages is not None:
+                self.gguf.add_languages(metadata.languages)
+            if metadata.datasets is not None:
+                self.gguf.add_datasets(metadata.datasets)

    def add_meta_arch(self, params: Params) -> None:
        # Metadata About The Neural Architecture Itself
@ -943,7 +965,7 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@ -977,7 +999,7 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
-        metadata: Metadata = None,
+        metadata: gguf.Metadata | None = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@ -1020,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


-def model_parameter_count(model: LazyModel) -> int:
-    total_model_parameters = 0
-    for i, (name, lazy_tensor) in enumerate(model.items()):
-        sum_weights_in_tensor = 1
+def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
+    total_params = 0
+    shared_params = 0
+    expert_params = 0
+
+    for name, lazy_tensor in tensors:
+        # We don't need these
+        if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Got A Tensor
+        sum_weights_in_tensor: int = 1
+
+        # Tensor Volume
        for dim in lazy_tensor.shape:
            sum_weights_in_tensor *= dim
-        total_model_parameters += sum_weights_in_tensor
-    return total_model_parameters

+        if ".experts." in name:
+            if ".experts.0." in name:
+                expert_params += sum_weights_in_tensor
+        else:
+            shared_params += sum_weights_in_tensor

-def model_parameter_count_rounded_notation(model_params_count: int) -> str:
-    if model_params_count > 1e12 :
-        # Trillions Of Parameters
-        scaled_model_params = model_params_count * 1e-12
-        scale_suffix = "T"
-    elif model_params_count > 1e9 :
-        # Billions Of Parameters
-        scaled_model_params = model_params_count * 1e-9
-        scale_suffix = "B"
-    elif model_params_count > 1e6 :
-        # Millions Of Parameters
-        scaled_model_params = model_params_count * 1e-6
-        scale_suffix = "M"
-    else:
-        # Thousands Of Parameters
-        scaled_model_params = model_params_count * 1e-3
-        scale_suffix = "K"
+        total_params += sum_weights_in_tensor

-    return f"{round(scaled_model_params)}{scale_suffix}"
+    return total_params, shared_params, expert_params


 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1230,34 +1249,24 @@ class VocabFactory:
        return vocab, special_vocab


-def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
-    quantization = {
+def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
+    name = metadata.name if metadata.name is not None else None
+    basename = metadata.basename if metadata.basename is not None else None
+    finetune = metadata.finetune if metadata.finetune is not None else None
+    version = metadata.version if metadata.version is not None else None
+    size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
+
+    output_type = {
        GGMLFileType.AllF32:    "F32",
        GGMLFileType.MostlyF16: "F16",
        GGMLFileType.MostlyQ8_0: "Q8_0",
    }[file_type]

-    parameters = model_parameter_count_rounded_notation(model_params_count)
-
-    expert_count = ""
-    if params.n_experts is not None:
-        expert_count = f"{params.n_experts}x"
-
-    version = ""
-    if metadata is not None and metadata.version is not None:
-        version = f"-{metadata.version}"
-
-    name = "ggml-model"
-    if metadata is not None and metadata.name is not None:
-        name = metadata.name
-    elif params.path_model is not None:
-        name = params.path_model.name
-
-    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+    return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
-    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
    ret = model_paths[0].parent / f"{default_filename}.gguf"
    if ret in model_paths:
        logger.error(
@ -1296,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
-    parser.add_argument("--metadata",     type=Path,              help="Specify the path for a metadata file")
+    parser.add_argument("--metadata",     type=Path,              help="Specify the path for an authorship metadata override file")
    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")
+    parser.add_argument("--model-name",   type=str, default=None, help="name of the model")

    args = parser.parse_args(args_in)

@ -1309,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        logging.basicConfig(level=logging.INFO)

-    metadata = Metadata.load(args.metadata)
+    model_name = args.model_name
+    dir_model = args.model
+
+    metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)

    if args.get_outfile:
-        model_plus = load_some_model(args.model)
+        model_plus = load_some_model(dir_model)
        params = Params.load(model_plus)
-        model   = convert_model_names(model_plus.model, params, args.skip_unknown)
-        model_params_count = model_parameter_count(model_plus.model)
-        ftype   = pick_output_type(model, args.outtype)
-        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
+        model = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = per_model_weight_count_estimation(model_plus.model.items())
+        ftype = pick_output_type(model, args.outtype)
+
+        if (metadata is None or metadata.name is None) and params.path_model is not None:
+            metadata.name = params.path_model.name
+
+        print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
        return

    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

    if args.dump_single:
-        model_plus = lazy_load_file(args.model)
+        model_plus = lazy_load_file(dir_model)
        do_dump_model(model_plus)
        return

    if not args.vocab_only:
-        model_plus = load_some_model(args.model)
+        model_plus = load_some_model(dir_model)
    else:
-        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
-
-    model_params_count = model_parameter_count(model_plus.model)
-    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
+        model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)

    if args.dump:
        do_dump_model(model_plus)
@ -1367,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None:
        logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
-    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
+    vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@ -1396,13 +1410,23 @@ def main(args_in: list[str] | None = None) -> None:
    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
        vocab = model_plus.vocab

+    assert params is not None
+
+    if metadata.name is None and params.path_model is not None:
+        metadata.name = params.path_model.name
+
+    model_params_count = per_model_weight_count_estimation(model_plus.model.items())
+    logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
+
    logger.info(f"Vocab info: {vocab}")
    logger.info(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
+
+    metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)

    params.ftype = ftype
    logger.info(f"Writing {outfile}, format {ftype}")
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
@ -0,0 +1,49 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
+
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@ -58,4 +58,3 @@ The above command will output space-separated float values.
 ```powershell
 embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
-
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) &data[i];
                    } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                    }
                    printf("%12.4f", v);
                    sum += v;
@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {

    char src1_str[128] = {0};
    if (src1) {
-        sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
 usage: llama-export-lora [options]

 options:
-  -h, --help                         show this help message and exit
-  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
-  -o FNAME, --model-out FNAME        path to save exported model (default '')
-  -l FNAME, --lora FNAME             apply LoRA adapter
-  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
-  -t N, --threads N                  number of threads to use during computation (default: 4)
+  -m,    --model                  model path from which to load base model (default '')
+         --lora FNAME             path to LoRA adapter  (can be repeated to use multiple adapters)
+         --lora-scaled FNAME S    path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)
+  -t,    --threads N              number of threads to use during computation (default: 4)
+  -o,    --output FNAME           output file (default: 'ggml-lora-merged-f16.gguf')
 ```

 For example:
@ -20,7 +19,15 @@ For example:
 ./bin/llama-export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
 ```

-Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
+
+```bash
+./bin/llama-export-lora \
+    -m your_base_model.gguf \
+    -o your_merged_model.gguf \
+    --lora-scaled lora_task_A.gguf 0.5 \
+    --lora-scaled lora_task_B.gguf 0.5
+```
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -1,462 +1,420 @@
-
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"

+#include <map>
 #include <vector>
 #include <string>
 #include <thread>
+#include <fstream>

-struct lora_info {
-    std::string filename;
+static bool g_verbose = false;
+
+static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+}
+
+static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ ctx_ggml,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load input GGUF from " + fname);
+    }
+    return ctx_gguf;
+}
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
+    }
+    s = std::move(result);
+}
+
+struct file_input {
+    struct ggml_context * ctx_meta = nullptr;
+    struct gguf_context * ctx_gguf = nullptr;
+    std::ifstream f_in;
+    std::map<std::string, ggml_tensor *> tensors;
+    float alpha;
    float scale;
+
+    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+        if (!f_in.is_open()) {
+            throw std::runtime_error("failed to open input gguf from " + fname);
+        }
+
+        ctx_gguf = load_gguf(fname, &ctx_meta);
+        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+            std::string name(cur->name);
+            tensors[name] = cur;
+            if (g_verbose) {
+                printf("%s: %s\n", __func__, cur->name);
+            }
+        }
+    }
+
+    ggml_tensor * get_tensor(std::string name) {
+        if (tensors.find(name) == tensors.end()) {
+            return nullptr;
+        }
+        return tensors[name];
+    }
+
+    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("cannot find tensor with name: " + name);
+        }
+        auto len = ggml_nbytes(tensors[name]);
+        if (buf.size() < len) {
+            buf.resize(len);
+        }
+        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+        f_in.seekg(offset);
+        f_in.read((char* )buf.data(), len);
+    }
+
+    ~file_input() {
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+    }
 };

-struct export_lora_params {
-    std::string fn_model_base;
-    std::string fn_model_out;
-    std::vector<struct lora_info> lora;
+struct lora_merge_ctx {
+    // input base model + adapters
+    file_input base_model;
+    std::vector<std::unique_ptr<file_input>> adapters;
+
+    // for computing merged tensor
    int n_threads;
-};
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> read_buf;

-struct lora_data {
-    struct lora_info     info;
-    std::vector<uint8_t> data;
-    struct ggml_context * ctx;
+    // output file
+    struct gguf_context * ctx_out;
+    struct ggml_context * ctx_out_ggml;
+    std::ofstream fout;

-    uint32_t lora_r;
-    uint32_t lora_alpha;
-};
+    lora_merge_ctx(
+            std::string & base_fname,
+            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::string & outfile,
+            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors

-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
+        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+            throw std::runtime_error("split model is not yet supported");
+        }

-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
+        for (auto lora_inp : lora_files) {
+            auto fname = std::get<0>(lora_inp);
+            auto scale = std::get<1>(lora_inp);
+            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+            check_metadata_lora(adapter.get());
+            adapters.push_back(std::move(adapter));
+        }
+
+        ctx_out = gguf_init_empty();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_out_ggml = ggml_init(params);
+        backend = ggml_backend_cpu_init();
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+
+    void check_metadata_lora(file_input * adapter) {
+        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
+        if (general_arch_base != general_arch_lora) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+    }
+
+    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+        if (t->type == GGML_TYPE_F32) {
+            return GGML_TYPE_F32;
        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
+            return GGML_TYPE_F16;
        }
    }

-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
+    void run_merge() {
+        // prepare metadata
+        gguf_set_kv(ctx_out, base_model.ctx_gguf);
+        // output is forced to f16 for now
+        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);

-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // check if all lora adapters have the same tensors
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+        if (adapters.size() > 1) {
+            for (size_t i = 1; i < adapters.size(); ++i) {
+                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+                    throw std::runtime_error(err_no_subset_adapter);
+                }
+                for (auto & it : adapters[i]->tensors) {
+                    if (adapters[0]->get_tensor(it.first) == nullptr) {
+                        throw std::runtime_error(err_no_subset_adapter);
+                    }
+                }
+            }
        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
+
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        for (auto & it : base_model.tensors) {
+            bool t_a = true;
+            bool t_b = true;
+            for (auto & adapter : adapters) {
+                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+            }
+            auto base_tensor = it.second;
+            if (!t_a && !t_b) {
+                // only copy
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
+            } else if (t_a && t_b) {
+                // need merging
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
+                ggml_set_name(out_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
+            } else {
+                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+            }
        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
+
+        // placeholder for the meta data
+        {
+            size_t meta_size = gguf_get_meta_size(ctx_out);
+            zeros(fout, meta_size);
        }
-    }

-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // process base model tensors
+        size_t n_merged = 0;
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
+                n_merged++;
+            } else {
+                copy_tensor(it.first);
+            }
        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
+
+        // write output metadata
+        {
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.seekp(0);
+            fout.write((const char *)data.data(), data.size());
        }
+
+        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
    }

-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
+    void copy_tensor(struct ggml_tensor * base) {
+        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+        size_t len = ggml_nbytes(base);
+        base_model.read_tensor_data(base->name, read_buf);
+        fout.write((char* )read_buf.data(), len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
    }

-    bool eof() {
-        return tell() >= size;
-    }
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
+        std::string name_base(base->name);
+        std::string name_lora_a = name_base + ".lora_a";
+        std::string name_lora_b = name_base + ".lora_b";

-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+        // context for input tensor
+        std::vector<struct ggml_tensor *> inp_a(adapters.size());
+        std::vector<struct ggml_tensor *> inp_b(adapters.size());
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+
+        // alloc tensors
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            auto t_a = adapters[i]->get_tensor(name_lora_a);
+            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            inp_a[i] = ggml_dup_tensor(ctx, t_a);
+            inp_b[i] = ggml_dup_tensor(ctx, t_b);
        }
+        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+        // load base tensor to backend buffer
+        base_model.read_tensor_data(name_base, read_buf);
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            adapters[i]->read_tensor_data(name_lora_a, read_buf);
+            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+            adapters[i]->read_tensor_data(name_lora_b, read_buf);
+            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        }
+
+        // build graph
+        struct ggml_cgraph * gf;
+        {
+            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+            static std::vector<uint8_t> buf(buf_size);
+            struct ggml_init_params params0 = {
+                /*.mem_size   =*/ buf_size,
+                /*.mem_buffer =*/ buf.data(),
+                /*.no_alloc   =*/ true,
+            };
+            struct ggml_context * ctx0 = ggml_init(params0);
+            gf = ggml_new_graph(ctx0);
+            struct ggml_tensor * cur = inp_base;
+            for (size_t i = 0; i < adapters.size(); ++i) {
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                // scale
+                const float alpha = adapters[i]->alpha;
+                const float rank  = (float) inp_b[i]->ne[0];
+                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+                delta = ggml_scale(ctx0, delta, scale);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+            }
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
+            ggml_build_forward_expand(gf, cur);
+            ggml_free(ctx0);
+        }
+
+        // compute
+        {
+            ggml_gallocr_alloc_graph(allocr, gf);
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+            ggml_backend_graph_compute(backend, gf);
+        }
+
+        // write data to output file
+        {
+            auto result = gf->nodes[gf->n_nodes - 1];
+            size_t len = ggml_nbytes(result);
+            if (read_buf.size() < len) {
+                read_buf.resize(len);
+            }
+            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+            fout.write((char* )read_buf.data(), len);
+            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+        }
+
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+    }
+
+    ~lora_merge_ctx() {
+        ggml_gallocr_free(allocr);
+        ggml_backend_free(backend);
+        gguf_free(ctx_out);
+        ggml_free(ctx_out_ggml);
    }
 };

-static struct export_lora_params get_default_export_lora_params() {
-    struct export_lora_params result;
-    result.fn_model_base = "";
-    result.fn_model_out  = "";
-    result.n_threads = GGML_DEFAULT_N_THREADS;
-    return result;
-}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);

-static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
-    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
-    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
-    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
-    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
-    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
-}
-
-static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct export_lora_params default_params = get_default_export_lora_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (arg == "-m" || arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-
-    if (params->fn_model_base == default_params.fn_model_base) {
-        fprintf(stderr, "error: please specify a filename for model-base.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (params->fn_model_out == default_params.fn_model_out) {
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    return true;
-}
-
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
-
-static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
-    result->info = *info;
-    result->ctx = NULL;
-    result->lora_r     = 1;
-    result->lora_alpha = 1;
-
-    struct llama_file file(info->filename.c_str(), "rb");
-    if (file.fp == NULL) {
-        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
-            info->filename.c_str());
-        free_lora(result);
-        return NULL;
-    }
-
-    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
-    params_ggml.mem_buffer = NULL;
-    params_ggml.no_alloc   = true;
-    result->ctx = ggml_init(params_ggml);
-
-    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
-    }
-    uint32_t version = file.read_u32();
-    if (version != 1) {
-        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
-    }
-    result->lora_r     = file.read_u32();
-    result->lora_alpha = file.read_u32();
-    // read tensor infos from file
-    std::vector<char> name_buf;
-    std::vector<struct ggml_tensor *> tensors;
-    std::vector<size_t> tensors_offset;
-    size_t total_nbytes_pad = 0;
-    while(!file.eof()) {
-        int64_t ne[4]   = {1,1,1,1};
-        uint32_t n_dims  = file.read_u32();
-        uint32_t namelen = file.read_u32();
-        uint32_t type    = file.read_u32();
-        for (uint32_t k = 0; k < n_dims; ++k) {
-            ne[k] = (int64_t)file.read_u32();
-        }
-        name_buf.clear();
-        name_buf.resize(namelen + 1, '\0');
-        file.read_raw(name_buf.data(), namelen);
-        file.seek((0-file.tell()) & 31, SEEK_CUR);
-        size_t offset = file.tell();
-        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-        ggml_set_name(tensor, name_buf.data());
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        total_nbytes_pad += nbytes_pad;
-        tensors.push_back(tensor);
-        tensors_offset.push_back(offset);
-        file.seek(nbytes, SEEK_CUR);
-    }
-    // read tensor data
-    result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        struct ggml_tensor * tensor = tensors[i];
-        size_t offset     = tensors_offset[i];
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
-    }
-    return result;
-}
-
-
-static struct ggml_cgraph * build_graph_lora(
-    struct ggml_context * ctx,
-    struct ggml_tensor * tensor,
-    struct ggml_tensor * lora_a,
-    struct ggml_tensor * lora_b,
-    float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
-    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
-
-static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
-    if (lora->ctx == NULL) {
-        return false;
-    }
-    std::string name = ggml_get_name(tensor);
-    std::string name_a = name + std::string(".loraA");
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
-    }
-
-    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
-    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
-    params.mem_buffer = NULL;
-    params.no_alloc   = true;
-    struct ggml_context * ctx = NULL;
-    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph  * gf = NULL;
-
-    ctx   = ggml_init(params);
-    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
-    ggml_gallocr_alloc_graph(alloc, gf);
-
-    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> data_work;
-    data_work.resize(cplan.work_size);
-    cplan.work_data = data_work.data();
-
-    ggml_graph_compute(gf, &cplan);
-
-    ggml_gallocr_free(alloc);
-    ggml_free(ctx);
-    return true;
-}
-
-static void export_lora(struct export_lora_params * params) {
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
-            loras.push_back(lora);
-        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-
-    // open input file
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
-    if (!fin.fp) {
-        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
-    }
-
-    // open base model gguf, read tensors without their data
-    struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx      = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
-    }
-
-    // create output file
-    struct llama_file fout(params->fn_model_out.c_str(), "wb");
-    if (!fout.fp) {
-        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
-    }
-
-    // write gguf meta data
-    std::vector<uint8_t> meta;
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
-
-        if (i % 2 == 0) {
-            printf(".");
-        }
-    }
+    printf("\nexample usage:\n");
+    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
    printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
 }

 int main(int argc, char ** argv) {
-    struct export_lora_params params = get_default_export_lora_params();
+    gpt_params params;

-    if (!export_lora_params_parse(argc, argv, &params)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    export_lora(&params);
+    g_verbose = (params.verbosity == 1);
+    try {
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());

    return 0;
 }
--- a/examples/finetune/CMakeLists.txt
+++ b/examples/finetune/CMakeLists.txt
@ -1,5 +0,0 @@
-set(TARGET llama-finetune)
-add_executable(${TARGET} finetune.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@ -1,90 +0,0 @@
-# finetune
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# finetune LORA adapter
-./bin/llama-finetune \
-        --model-base open-llama-3b-v2-q8_0.gguf \
-        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
-        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
-        --train-data "shakespeare.txt" \
-        --save-every 10 \
-        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
-
-# predict
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-```
-
-**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
-The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
-So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
-
-llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
-
-In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
-
-For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
-  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
-```
-
-You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
-
-For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
-  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
-```
-
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
-
-Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
-If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
-
-The default LORA rank can be specified with `--lora-r N`.
-The LORA rank can be configured for each model tensor type separately with these command line options:
-
-```bash
-  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
-  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
-  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
-  --rank-out-norm N          LORA rank for output norm tensor (default 1)
-  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
-  --rank-out N               LORA rank for output tensor (default 4)
-  --rank-wq N                LORA rank for wq tensor (default 4)
-  --rank-wk N                LORA rank for wk tensor (default 4)
-  --rank-wv N                LORA rank for wv tensor (default 4)
-  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
-  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
-  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
-```
-
-The LORA rank of 'norm' tensors should always be 1.
-
-To see all available options use `finetune --help`.
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
@ -1,487 +0,0 @@
-#!/usr/bin/env python3
-# finetune checkpoint --> gguf conversion
-
-import argparse
-import gguf
-import struct
-import numpy as np
-from pathlib import Path
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output"
-LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm"
-LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q"
-LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k"
-LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v"
-LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output"
-LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm"
-LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate"
-LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down"
-LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.product(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        if tuple(ne) != tuple(self.ne):
-            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version != 1:
-            raise ValueError('Invalid version of optimization context in checkpoint file')
-
-        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-        self.adam_m  = Tensor('f', [self.nx])
-        self.adam_v  = Tensor('f', [self.nx])
-        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-        self.lbfgs_x    = Tensor('f', [self.nx])
-        self.lbfgs_xp   = Tensor('f', [self.nx])
-        self.lbfgs_g    = Tensor('f', [self.nx])
-        self.lbfgs_gp   = Tensor('f', [self.nx])
-        self.lbfgs_d    = Tensor('f', [self.nx])
-        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-        # forgot to save type in version 1:
-        # guess self.type from number of remaining bytes
-        size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                [self.adam_m, self.adam_v]
-                                +([self.adam_pf] if (self.past > 0) else [])])
-        size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                 self.lbfgs_lmal, self.lbfgs_lmys,
-                                 self.lbfgs_lms, self.lbfgs_lmy]
-                                 +([self.lbfgs_pf] if (self.past > 0) else [])])
-        # due to alignment padding the size might not by exact
-        # but the difference in size for both types is significant,
-        # so we can just use whichever is closest
-        remaining = len(data) - offset
-        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-            self.type = 0
-        else:
-            self.type = 1
-
-        if self.type == 0:
-            offset = self.adam_m.load(data, offset)
-            offset = self.adam_v.load(data, offset)
-            offset = self.adam_pf.load(data,offset)
-
-            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        elif self.type == 1:
-            offset = self.lbfgs_x.load(data, offset)
-            offset = self.lbfgs_xp.load(data, offset)
-            offset = self.lbfgs_g.load(data, offset)
-            offset = self.lbfgs_gp.load(data, offset)
-            offset = self.lbfgs_d.load(data, offset)
-            offset = self.lbfgs_pf.load(data, offset)
-            offset = self.lbfgs_lmal.load(data, offset)
-            offset = self.lbfgs_lmys.load(data, offset)
-            offset = self.lbfgs_lms.load(data, offset)
-            offset = self.lbfgs_lmy.load(data, offset)
-
-            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError(f"Invalid optimizer type '{self.type}'")
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class LoraParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_rank_attention_norm  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wq              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wk              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wv              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wo              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_ffn_norm        = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w1              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w2              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w3              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_tok_embeddings  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_norm            = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_output          = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)
-
-class ModelParams:
-    def __init__(self, n_ff = None):
-        self.n_ff = n_ff
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        if self.n_ff is None:
-            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-        else:
-            return self.n_ff
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
-
-class Layer:
-    def __init__(self, params, lora_params, bid):
-        self.bid = bid
-        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
-        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
-        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
-        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
-        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
-        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
-        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
-        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
-        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
-        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm_a.load(data, offset)
-        offset = self.att_norm_b.load(data, offset)
-        offset = self.wq_a.load(data, offset)
-        offset = self.wq_b.load(data, offset)
-        offset = self.wk_a.load(data, offset)
-        offset = self.wk_b.load(data, offset)
-        offset = self.wv_a.load(data, offset)
-        offset = self.wv_b.load(data, offset)
-        offset = self.wo_a.load(data, offset)
-        offset = self.wo_b.load(data, offset)
-        offset = self.ffn_norm_a.load(data, offset)
-        offset = self.ffn_norm_b.load(data, offset)
-        offset = self.w1_a.load(data, offset)
-        offset = self.w1_b.load(data, offset)
-        offset = self.w2_a.load(data, offset)
-        offset = self.w2_b.load(data, offset)
-        offset = self.w3_a.load(data, offset)
-        offset = self.w3_b.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
-        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
-        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
-        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
-        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
-        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
-        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
-        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
-        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
-        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
-        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
-        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
-        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
-        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
-        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
-        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
-        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
-        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
-
-class LoraModel:
-    def __init__(self, n_ff = None):
-        self.params = ModelParams(n_ff = n_ff)
-        self.lora_params = LoraParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-        offset = self.lora_params.load(data, offset)
-
-        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
-        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
-        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
-        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
-        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
-        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
-
-        offset = self.tok_embd_a.load(data, offset)
-        offset = self.tok_embd_b.load(data, offset)
-        offset = self.norm_a.load(data, offset)
-        offset = self.norm_b.load(data, offset)
-        offset = self.output_a.load(data, offset)
-        offset = self.output_b.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, self.lora_params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-        self.lora_params.save_gguf(gguf_writer)
-
-        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
-        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
-        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
-        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
-        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
-        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class LoraCheckpoint:
-    def __init__(self, n_ff = None):
-        self.model = LoraModel(n_ff = n_ff)
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcl':
-            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
-    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    print(cfg)
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = LoraCheckpoint(n_ff = cfg.ff)
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./llama-finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
    auto decoded = decode_utf8(input_str, {});
    const auto & code_points = decoded.first;

+    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
    size_t pos = 0;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+
+        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+
+        if (cur_stacks.empty()) {
            error_pos = pos;
            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            grammar->stacks = prev_stacks;
+            cur_stacks = prev_stacks;
            return false;
        }
        ++pos;
    }

-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
        if (stack.empty()) {
            return true;
        }
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@ -0,0 +1,15 @@
+set(TARGET llama-gguf-hash)
+add_executable(${TARGET} gguf-hash.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+
+# clibs dependencies
+include_directories(deps/)
+add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
+target_link_libraries(${TARGET} PRIVATE xxhash)
+add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
+target_link_libraries(${TARGET} PRIVATE sha1)
+add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
+target_link_libraries(${TARGET} PRIVATE sha256)
+
+target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@ -0,0 +1,206 @@
+
+# llama-gguf-hash
+
+CLI to hash GGUF files to detect difference on a per model and per tensor level.
+
+**Command line options:**
+
+- `--help`: display help message
+- `--xxh64`: use xhash 64bit hash mode (default)
+- `--sha1`: use sha1
+- `--uuid`: use uuid
+- `--sha256`: use sha256
+- `--all`: use all hash
+- `--no-layer`: exclude per layer hash
+- `--uuid`: generate UUIDv5 ID
+- `-c`, `--check <manifest>`:  verify against a manifest
+
+## About
+
+While most POSIX systems already have hash checking programs like sha256sum, it
+is designed to check entire files. This is not ideal for our purpose if we want
+to check for consistency of the tensor data even if the metadata content of the
+gguf KV store has been updated.
+
+This program is designed to hash a gguf tensor payload on a 'per tensor layer'
+in addition to a 'entire tensor model' hash. The intent is that the entire
+tensor layer can be checked first but if there is any detected inconsistencies,
+then the per tensor hash can be used to narrow down the specific tensor layer
+that has inconsistencies.
+
+For Maintainers:
+- Detection of tensor inconsistency during development and automated tests
+    - This is served by xxh64 which is fast
+    - This is also served by having per tensor layer to assist in narrowing down
+      the location of the faulty tensor layer
+    - This is also served by sha1 which is much slower but more widely supported
+
+For Model Creators:
+- Optional consistent UUID generation based on model tensor content
+    - This is served by UUIDv5 which is useful for databases keys
+        - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
+            - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
+
+For Model Users:
+- Assurance of tensor layer integrity even if metadata was updated
+    - This is served by sha256 which is still considered very secure as of 2024
+
+### Design Note
+
+- The default behavior of this program if no arguments is provided is to hash
+  using xxhash's xxh32 mode because it is very fast and is primarily targeted
+  towards maintainers who may want to use this in automated tests.
+- xxhash support xxh32 and xxh128 for 32bit hash and 128bit hash respectively
+  however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
+  would have a better affinity to calculating hash that is 64bit in size.
+
+## Compile Example
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
+make -C build clean
+make -C build llama-gguf-hash VERBOSE=1
+./build/bin/llama-gguf-hash test.gguf
+./build/bin/llama-gguf-hash --xxh64 test.gguf
+./build/bin/llama-gguf-hash --sha1 test.gguf
+./build/bin/llama-gguf-hash --uuid test.gguf
+./build/bin/llama-gguf-hash --sha256 test.gguf
+```
+
+## Generation and Verification Example
+
+To generate we may use this command
+
+```bash
+./llama-gguf-hash --all test.gguf > test.gguf.manifest
+```
+
+Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well
+(This excludes UUID as that is an ID not a hash)
+
+```bash
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0
+sha1      59f79ecefd8125a996fdf419239051a7e99e5f20  test.gguf:tensor_0
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1
+sha1      4765f592eacf096df4628ba59476af94d767080a  test.gguf:tensor_1
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1
+xxh64     a0af5d700049693b  test.gguf:tensor_2
+sha1      25cbfbad4513cc348e2c95ebdee69d6ff2fd8753  test.gguf:tensor_2
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3
+sha1      a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c  test.gguf:tensor_3
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3
+xxh64     1257733306b7992d  test.gguf:tensor_4
+sha1      d7bc61db93bb685ce9d598da89717c66729b7543  test.gguf:tensor_4
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4
+xxh64     d238d16ba4711e58  test.gguf:tensor_5
+sha1      0706566c198fe1072f37e0a5135b4b5f23654c52  test.gguf:tensor_5
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6
+sha1      73922a0727226a409049f6fc3172a52219ca6f00  test.gguf:tensor_6
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6
+xxh64     c22021c29854f093  test.gguf:tensor_7
+sha1      efc39cece6a951188fc41e354c73bbfe6813d447  test.gguf:tensor_7
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7
+xxh64     936df61f5d64261f  test.gguf:tensor_8
+sha1      c2490296d789a4f34398a337fed8377d943d9f06  test.gguf:tensor_8
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8
+xxh64     93fd20c64421c081  test.gguf:tensor_9
+sha1      7047ce1e78437a6884337a3751c7ee0421918a65  test.gguf:tensor_9
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9
+xxh64     5a54d3aad816f302  test.gguf
+sha1      d15be52c4ff213e823cb6dd13af7ee2f978e7042  test.gguf
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf
+```
+
+We can then use the normal check command which will by default check for the highest security strength hash and verify against that:
+
+```bash
+$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0  -  Ok
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1  -  Ok
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2  -  Ok
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3  -  Ok
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4  -  Ok
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5  -  Ok
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6  -  Ok
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7  -  Ok
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8  -  Ok
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9  -  Ok
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+Or we may explicitly ask for a faster hash like:
+
+```bash
+$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0  -  Ok
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1  -  Ok
+xxh64     a0af5d700049693b  test.gguf:tensor_2  -  Ok
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3  -  Ok
+xxh64     1257733306b7992d  test.gguf:tensor_4  -  Ok
+xxh64     d238d16ba4711e58  test.gguf:tensor_5  -  Ok
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6  -  Ok
+xxh64     c22021c29854f093  test.gguf:tensor_7  -  Ok
+xxh64     936df61f5d64261f  test.gguf:tensor_8  -  Ok
+xxh64     93fd20c64421c081  test.gguf:tensor_9  -  Ok
+xxh64     5a54d3aad816f302  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+Or maybe we want to just check that all the hash is valid:
+
+```bash
+$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0  -  Ok
+sha1      59f79ecefd8125a996fdf419239051a7e99e5f20  test.gguf:tensor_0  -  Ok
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0  -  Ok
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1  -  Ok
+sha1      4765f592eacf096df4628ba59476af94d767080a  test.gguf:tensor_1  -  Ok
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1  -  Ok
+xxh64     a0af5d700049693b  test.gguf:tensor_2  -  Ok
+sha1      25cbfbad4513cc348e2c95ebdee69d6ff2fd8753  test.gguf:tensor_2  -  Ok
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2  -  Ok
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3  -  Ok
+sha1      a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c  test.gguf:tensor_3  -  Ok
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3  -  Ok
+xxh64     1257733306b7992d  test.gguf:tensor_4  -  Ok
+sha1      d7bc61db93bb685ce9d598da89717c66729b7543  test.gguf:tensor_4  -  Ok
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4  -  Ok
+xxh64     d238d16ba4711e58  test.gguf:tensor_5  -  Ok
+sha1      0706566c198fe1072f37e0a5135b4b5f23654c52  test.gguf:tensor_5  -  Ok
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5  -  Ok
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6  -  Ok
+sha1      73922a0727226a409049f6fc3172a52219ca6f00  test.gguf:tensor_6  -  Ok
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6  -  Ok
+xxh64     c22021c29854f093  test.gguf:tensor_7  -  Ok
+sha1      efc39cece6a951188fc41e354c73bbfe6813d447  test.gguf:tensor_7  -  Ok
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7  -  Ok
+xxh64     936df61f5d64261f  test.gguf:tensor_8  -  Ok
+sha1      c2490296d789a4f34398a337fed8377d943d9f06  test.gguf:tensor_8  -  Ok
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8  -  Ok
+xxh64     93fd20c64421c081  test.gguf:tensor_9  -  Ok
+sha1      7047ce1e78437a6884337a3751c7ee0421918a65  test.gguf:tensor_9  -  Ok
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9  -  Ok
+xxh64     5a54d3aad816f302  test.gguf  -  Ok
+sha1      d15be52c4ff213e823cb6dd13af7ee2f978e7042  test.gguf  -  Ok
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+
+## Crypto/Hash Libraries Used
+
+These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
+
+- https://github.com/Cyan4973/xxHash
+- https://github.com/clibs/sha1/
+- https://github.com/jb55/sha256.c
--- a/examples/gguf-hash/deps/rotate-bits/package.json
+++ b/examples/gguf-hash/deps/rotate-bits/package.json
@ -0,0 +1,13 @@
+{
+  "name": "rotate-bits",
+  "version": "0.1.1",
+  "repo": "jb55/rotate-bits.h",
+  "description": "rotate bits",
+  "keywords": ["rotl", "rotr"],
+  "src": ["rotate-bits.h"],
+  "license": "Public Domain",
+  "development": {
+    "thlorenz/tap.c": "*"
+  }
+}
+
--- a/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
+++ b/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
@ -0,0 +1,46 @@
+
+
+#ifndef __ROTATE_DEFS_H
+#define __ROTATE_DEFS_H
+
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+
+#define ROTL32(v, n) _rotl((v), (n))
+#define ROTL64(v, n) _rotl64((v), (n))
+
+#define ROTR32(v, n) _rotr((v), (n))
+#define ROTR64(v, n) _rotr64((v), (n))
+
+#else
+
+#include <stdint.h>
+
+#define U8V(v) ((uint8_t)(v) & 0xFFU)
+#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
+#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
+#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
+
+#define ROTL32(v, n) \
+  (U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
+
+// tests fail if we don't have this cast...
+#define ROTL64(v, n) \
+  (U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
+
+#define ROTR32(v, n) ROTL32(v, 32 - (n))
+#define ROTR64(v, n) ROTL64(v, 64 - (n))
+
+#endif
+
+#define ROTL8(v, n) \
+  (U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
+
+#define ROTL16(v, n) \
+  (U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
+
+#define ROTR8(v, n) ROTL8(v, 8 - (n))
+#define ROTR16(v, n) ROTL16(v, 16 - (n))
+
+#endif
--- a/examples/gguf-hash/deps/sha1/package.json
+++ b/examples/gguf-hash/deps/sha1/package.json
@ -0,0 +1,9 @@
+{
+  "name": "sha1",
+  "version": "0.0.1",
+  "repo": "clibs/sha1",
+  "description": "sha1 hash algorithm",
+  "keywords": ["sha1", "hash"],
+  "license": "public domain",
+  "src": ["sha1.c", "sha1.h"]
+}
--- a/examples/gguf-hash/deps/sha1/sha1.c
+++ b/examples/gguf-hash/deps/sha1/sha1.c
@ -0,0 +1,295 @@
+/*
+SHA-1 in C
+By Steve Reid <steve@edmweb.com>
+100% Public Domain
+
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+  A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+  84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+  34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
+/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
+/* #define SHA1HANDSOFF * Copies data before messing with it. */
+
+#define SHA1HANDSOFF
+
+#include <stdio.h>
+#include <string.h>
+
+/* for uint32_t */
+#include <stdint.h>
+
+#include "sha1.h"
+
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+    |(rol(block->l[i],8)&0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+    ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+
+void SHA1Transform(
+    uint32_t state[5],
+    const unsigned char buffer[64]
+)
+{
+    uint32_t a, b, c, d, e;
+
+    typedef union
+    {
+        unsigned char c[64];
+        uint32_t l[16];
+    } CHAR64LONG16;
+
+#ifdef SHA1HANDSOFF
+    CHAR64LONG16 block[1];      /* use array to appear as a pointer */
+
+    memcpy(block, buffer, 64);
+#else
+    /* The following had better never be used because it causes the
+     * pointer-to-const buffer to be cast into a pointer to non-const.
+     * And the result is written through.  I threw a "const" in, hoping
+     * this will cause a diagnostic.
+     */
+    CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
+#endif
+    /* Copy context->state[] to working vars */
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    /* 4 rounds of 20 operations each. Loop unrolled. */
+    R0(a, b, c, d, e, 0);
+    R0(e, a, b, c, d, 1);
+    R0(d, e, a, b, c, 2);
+    R0(c, d, e, a, b, 3);
+    R0(b, c, d, e, a, 4);
+    R0(a, b, c, d, e, 5);
+    R0(e, a, b, c, d, 6);
+    R0(d, e, a, b, c, 7);
+    R0(c, d, e, a, b, 8);
+    R0(b, c, d, e, a, 9);
+    R0(a, b, c, d, e, 10);
+    R0(e, a, b, c, d, 11);
+    R0(d, e, a, b, c, 12);
+    R0(c, d, e, a, b, 13);
+    R0(b, c, d, e, a, 14);
+    R0(a, b, c, d, e, 15);
+    R1(e, a, b, c, d, 16);
+    R1(d, e, a, b, c, 17);
+    R1(c, d, e, a, b, 18);
+    R1(b, c, d, e, a, 19);
+    R2(a, b, c, d, e, 20);
+    R2(e, a, b, c, d, 21);
+    R2(d, e, a, b, c, 22);
+    R2(c, d, e, a, b, 23);
+    R2(b, c, d, e, a, 24);
+    R2(a, b, c, d, e, 25);
+    R2(e, a, b, c, d, 26);
+    R2(d, e, a, b, c, 27);
+    R2(c, d, e, a, b, 28);
+    R2(b, c, d, e, a, 29);
+    R2(a, b, c, d, e, 30);
+    R2(e, a, b, c, d, 31);
+    R2(d, e, a, b, c, 32);
+    R2(c, d, e, a, b, 33);
+    R2(b, c, d, e, a, 34);
+    R2(a, b, c, d, e, 35);
+    R2(e, a, b, c, d, 36);
+    R2(d, e, a, b, c, 37);
+    R2(c, d, e, a, b, 38);
+    R2(b, c, d, e, a, 39);
+    R3(a, b, c, d, e, 40);
+    R3(e, a, b, c, d, 41);
+    R3(d, e, a, b, c, 42);
+    R3(c, d, e, a, b, 43);
+    R3(b, c, d, e, a, 44);
+    R3(a, b, c, d, e, 45);
+    R3(e, a, b, c, d, 46);
+    R3(d, e, a, b, c, 47);
+    R3(c, d, e, a, b, 48);
+    R3(b, c, d, e, a, 49);
+    R3(a, b, c, d, e, 50);
+    R3(e, a, b, c, d, 51);
+    R3(d, e, a, b, c, 52);
+    R3(c, d, e, a, b, 53);
+    R3(b, c, d, e, a, 54);
+    R3(a, b, c, d, e, 55);
+    R3(e, a, b, c, d, 56);
+    R3(d, e, a, b, c, 57);
+    R3(c, d, e, a, b, 58);
+    R3(b, c, d, e, a, 59);
+    R4(a, b, c, d, e, 60);
+    R4(e, a, b, c, d, 61);
+    R4(d, e, a, b, c, 62);
+    R4(c, d, e, a, b, 63);
+    R4(b, c, d, e, a, 64);
+    R4(a, b, c, d, e, 65);
+    R4(e, a, b, c, d, 66);
+    R4(d, e, a, b, c, 67);
+    R4(c, d, e, a, b, 68);
+    R4(b, c, d, e, a, 69);
+    R4(a, b, c, d, e, 70);
+    R4(e, a, b, c, d, 71);
+    R4(d, e, a, b, c, 72);
+    R4(c, d, e, a, b, 73);
+    R4(b, c, d, e, a, 74);
+    R4(a, b, c, d, e, 75);
+    R4(e, a, b, c, d, 76);
+    R4(d, e, a, b, c, 77);
+    R4(c, d, e, a, b, 78);
+    R4(b, c, d, e, a, 79);
+    /* Add the working vars back into context.state[] */
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    /* Wipe variables */
+    a = b = c = d = e = 0;
+#ifdef SHA1HANDSOFF
+    memset(block, '\0', sizeof(block));
+#endif
+}
+
+
+/* SHA1Init - Initialize new context */
+
+void SHA1Init(
+    SHA1_CTX * context
+)
+{
+    /* SHA1 initialization constants */
+    context->state[0] = 0x67452301;
+    context->state[1] = 0xEFCDAB89;
+    context->state[2] = 0x98BADCFE;
+    context->state[3] = 0x10325476;
+    context->state[4] = 0xC3D2E1F0;
+    context->count[0] = context->count[1] = 0;
+}
+
+
+/* Run your data through this. */
+
+void SHA1Update(
+    SHA1_CTX * context,
+    const unsigned char *data,
+    uint32_t len
+)
+{
+    uint32_t i;
+
+    uint32_t j;
+
+    j = context->count[0];
+    if ((context->count[0] += len << 3) < j)
+        context->count[1]++;
+    context->count[1] += (len >> 29);
+    j = (j >> 3) & 63;
+    if ((j + len) > 63)
+    {
+        memcpy(&context->buffer[j], data, (i = 64 - j));
+        SHA1Transform(context->state, context->buffer);
+        for (; i + 63 < len; i += 64)
+        {
+            SHA1Transform(context->state, &data[i]);
+        }
+        j = 0;
+    }
+    else
+        i = 0;
+    memcpy(&context->buffer[j], &data[i], len - i);
+}
+
+
+/* Add padding and return the message digest. */
+
+void SHA1Final(
+    unsigned char digest[20],
+    SHA1_CTX * context
+)
+{
+    unsigned i;
+
+    unsigned char finalcount[8];
+
+    unsigned char c;
+
+#if 0    /* untested "improvement" by DHR */
+    /* Convert context->count to a sequence of bytes
+     * in finalcount.  Second element first, but
+     * big-endian order within element.
+     * But we do it all backwards.
+     */
+    unsigned char *fcp = &finalcount[8];
+
+    for (i = 0; i < 2; i++)
+    {
+        uint32_t t = context->count[i];
+
+        int j;
+
+        for (j = 0; j < 4; t >>= 8, j++)
+            *--fcp = (unsigned char) t}
+#else
+    for (i = 0; i < 8; i++)
+    {
+        finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255);      /* Endian independent */
+    }
+#endif
+    c = 0200;
+    SHA1Update(context, &c, 1);
+    while ((context->count[0] & 504) != 448)
+    {
+        c = 0000;
+        SHA1Update(context, &c, 1);
+    }
+    SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
+    for (i = 0; i < 20; i++)
+    {
+        digest[i] = (unsigned char)
+            ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+    }
+    /* Wipe variables */
+    memset(context, '\0', sizeof(*context));
+    memset(&finalcount, '\0', sizeof(finalcount));
+}
+
+void SHA1(
+    char *hash_out,
+    const char *str,
+    uint32_t len)
+{
+    SHA1_CTX ctx;
+    unsigned int ii;
+
+    SHA1Init(&ctx);
+    for (ii=0; ii<len; ii+=1)
+        SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
+    SHA1Final((unsigned char *)hash_out, &ctx);
+}
+
--- a/examples/gguf-hash/deps/sha1/sha1.h
+++ b/examples/gguf-hash/deps/sha1/sha1.h
@ -0,0 +1,52 @@
+#ifndef SHA1_H
+#define SHA1_H
+
+/*
+   SHA-1 in C
+   By Steve Reid <steve@edmweb.com>
+   100% Public Domain
+ */
+
+#include "stdint.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct
+{
+    uint32_t state[5];
+    uint32_t count[2];
+    unsigned char buffer[64];
+} SHA1_CTX;
+
+void SHA1Transform(
+    uint32_t state[5],
+    const unsigned char buffer[64]
+    );
+
+void SHA1Init(
+    SHA1_CTX * context
+    );
+
+void SHA1Update(
+    SHA1_CTX * context,
+    const unsigned char *data,
+    uint32_t len
+    );
+
+void SHA1Final(
+    unsigned char digest[20],
+    SHA1_CTX * context
+    );
+
+void SHA1(
+    char *hash_out,
+    const char *str,
+    uint32_t len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* SHA1_H */
--- a/examples/gguf-hash/deps/sha256/package.json
+++ b/examples/gguf-hash/deps/sha256/package.json
@ -0,0 +1,15 @@
+{
+  "name": "sha256",
+  "version": "0.0.2",
+  "repo": "jb55/sha256.c",
+  "description": "sha256 in c",
+  "keywords": ["sha256", "sha2"],
+  "src": ["sha256.c", "sha256.h"],
+  "dependencies": {
+    "jb55/rotate-bits.h": "0.1.1"
+  },
+  "development": {
+    "thlorenz/tap.c": "*"
+  }
+}
+
--- a/examples/gguf-hash/deps/sha256/sha256.c
+++ b/examples/gguf-hash/deps/sha256/sha256.c
@ -0,0 +1,221 @@
+/* Crypto/Sha256.c -- SHA-256 Hash
+2010-06-11 : Igor Pavlov : Public domain
+This code is based on public domain code from Wei Dai's Crypto++ library. */
+
+#include "rotate-bits/rotate-bits.h"
+#include "sha256.h"
+
+/* define it for speed optimization */
+#define _SHA256_UNROLL
+#define _SHA256_UNROLL2
+
+void
+sha256_init(sha256_t *p)
+{
+  p->state[0] = 0x6a09e667;
+  p->state[1] = 0xbb67ae85;
+  p->state[2] = 0x3c6ef372;
+  p->state[3] = 0xa54ff53a;
+  p->state[4] = 0x510e527f;
+  p->state[5] = 0x9b05688c;
+  p->state[6] = 0x1f83d9ab;
+  p->state[7] = 0x5be0cd19;
+  p->count = 0;
+}
+
+#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
+#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
+#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
+#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
+
+#define blk0(i) (W[i] = data[i])
+#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
+
+#define Ch(x,y,z) (z^(x&(y^z)))
+#define Maj(x,y,z) ((x&y)|(z&(x|y)))
+
+#define a(i) T[(0-(i))&7]
+#define b(i) T[(1-(i))&7]
+#define c(i) T[(2-(i))&7]
+#define d(i) T[(3-(i))&7]
+#define e(i) T[(4-(i))&7]
+#define f(i) T[(5-(i))&7]
+#define g(i) T[(6-(i))&7]
+#define h(i) T[(7-(i))&7]
+
+
+#ifdef _SHA256_UNROLL2
+
+#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
+  d += h; h += S0(a) + Maj(a, b, c)
+
+#define RX_8(i) \
+  R(a,b,c,d,e,f,g,h, i); \
+  R(h,a,b,c,d,e,f,g, (i+1)); \
+  R(g,h,a,b,c,d,e,f, (i+2)); \
+  R(f,g,h,a,b,c,d,e, (i+3)); \
+  R(e,f,g,h,a,b,c,d, (i+4)); \
+  R(d,e,f,g,h,a,b,c, (i+5)); \
+  R(c,d,e,f,g,h,a,b, (i+6)); \
+  R(b,c,d,e,f,g,h,a, (i+7))
+
+#else
+
+#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
+  d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
+
+#ifdef _SHA256_UNROLL
+
+#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
+
+#endif
+
+#endif
+
+static const uint32_t K[64] = {
+  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+sha256_transform(uint32_t *state, const uint32_t *data)
+{
+  uint32_t W[16] = {0};
+  unsigned j;
+  #ifdef _SHA256_UNROLL2
+  uint32_t a,b,c,d,e,f,g,h;
+  a = state[0];
+  b = state[1];
+  c = state[2];
+  d = state[3];
+  e = state[4];
+  f = state[5];
+  g = state[6];
+  h = state[7];
+  #else
+  uint32_t T[8];
+  for (j = 0; j < 8; j++)
+    T[j] = state[j];
+  #endif
+
+  for (j = 0; j < 64; j += 16)
+  {
+    #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
+    RX_8(0); RX_8(8);
+    #else
+    unsigned i;
+    for (i = 0; i < 16; i++) { R(i); }
+    #endif
+  }
+
+  #ifdef _SHA256_UNROLL2
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
+  state[4] += e;
+  state[5] += f;
+  state[6] += g;
+  state[7] += h;
+  #else
+  for (j = 0; j < 8; j++)
+    state[j] += T[j];
+  #endif
+
+  /* Wipe variables */
+  /* memset(W, 0, sizeof(W)); */
+  /* memset(T, 0, sizeof(T)); */
+}
+
+#undef S0
+#undef S1
+#undef s0
+#undef s1
+
+static void
+sha256_write_byte_block(sha256_t *p)
+{
+  uint32_t data32[16];
+  unsigned i;
+  for (i = 0; i < 16; i++)
+    data32[i] =
+      ((uint32_t)(p->buffer[i * 4    ]) << 24) +
+      ((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
+      ((uint32_t)(p->buffer[i * 4 + 2]) <<  8) +
+      ((uint32_t)(p->buffer[i * 4 + 3]));
+  sha256_transform(p->state, data32);
+}
+
+
+void
+sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
+{
+  sha256_t hash;
+  sha256_init(&hash);
+  sha256_update(&hash, data, size);
+  sha256_final(&hash, buf);
+}
+
+
+void
+sha256_update(sha256_t *p, const unsigned char *data, size_t size)
+{
+  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
+  while (size > 0)
+  {
+    p->buffer[curBufferPos++] = *data++;
+    p->count++;
+    size--;
+    if (curBufferPos == 64)
+    {
+      curBufferPos = 0;
+      sha256_write_byte_block(p);
+    }
+  }
+}
+
+
+void
+sha256_final(sha256_t *p, unsigned char *digest)
+{
+  uint64_t lenInBits = (p->count << 3);
+  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
+  unsigned i;
+  p->buffer[curBufferPos++] = 0x80;
+  while (curBufferPos != (64 - 8))
+  {
+    curBufferPos &= 0x3F;
+    if (curBufferPos == 0)
+      sha256_write_byte_block(p);
+    p->buffer[curBufferPos++] = 0;
+  }
+  for (i = 0; i < 8; i++)
+  {
+    p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
+    lenInBits <<= 8;
+  }
+  sha256_write_byte_block(p);
+
+  for (i = 0; i < 8; i++)
+  {
+    *digest++ = (unsigned char)(p->state[i] >> 24);
+    *digest++ = (unsigned char)(p->state[i] >> 16);
+    *digest++ = (unsigned char)(p->state[i] >> 8);
+    *digest++ = (unsigned char)(p->state[i]);
+  }
+  sha256_init(p);
+}
--- a/examples/gguf-hash/deps/sha256/sha256.h
+++ b/examples/gguf-hash/deps/sha256/sha256.h
@ -0,0 +1,24 @@
+/* Sha256.h -- SHA-256 Hash
+2010-06-11 : Igor Pavlov : Public domain */
+
+#ifndef __CRYPTO_SHA256_H
+#define __CRYPTO_SHA256_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define SHA256_DIGEST_SIZE 32
+
+typedef struct sha256_t
+{
+  uint32_t state[8];
+  uint64_t count;
+  unsigned char buffer[64];
+} sha256_t;
+
+void sha256_init(sha256_t *p);
+void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
+void sha256_final(sha256_t *p, unsigned char *digest);
+void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
+
+#endif
--- a/examples/gguf-hash/deps/xxhash/clib.json
+++ b/examples/gguf-hash/deps/xxhash/clib.json
@ -0,0 +1,12 @@
+{
+  "name": "xxhash",
+  "version": "0.8.2",
+  "repo": "Cyan4973/xxhash",
+  "description": "Extremely fast non-cryptographic hash algorithm",
+  "keywords": ["xxhash", "hashing"],
+  "license": "BSD-2-Clause",
+  "src": [
+    "xxhash.c",
+    "xxhash.h"
+  ]
+}
--- a/examples/gguf-hash/deps/xxhash/xxhash.c
+++ b/examples/gguf-hash/deps/xxhash/xxhash.c
@ -0,0 +1,42 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
+#define XXH_IMPLEMENTATION      /* access definitions */
+
+#include "xxhash.h"
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@ -0,0 +1,693 @@
+#include "ggml.h"
+
+#include <cstdlib>   /* abort() */
+#include <cstddef>
+#include <cstdio>
+#include <string>
+#include <stdexcept>
+#include <algorithm>
+#include <cstring>
+
+#include <sstream>
+#include <fstream>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "xxhash/xxhash.h"
+#include "sha1/sha1.h"
+#include "sha256/sha256.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+
+// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
+#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
+#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
+
+
+#define HASH_TYPE_SHA256_STR "sha256"
+#define HASH_TYPE_SHA1_STR   "sha1"
+#define HASH_TYPE_XXH64_STR  "xxh64"
+#define HASH_TYPE_UUID_STR   "uuid"
+
+
+typedef enum {
+    HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated
+    HASH_EXIT_FAILURE = 1, // Generic Failure
+    HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
+    HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
+    HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
+    HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
+} hash_exit_code_t;
+
+
+typedef enum {
+    HASH_MANIFEST_NOT_FOUND,
+    HASH_MANIFEST_MISMATCH,
+    HASH_MANIFEST_OK,
+} hash_manifest_result_t;
+
+
+struct hash_params {
+    std::string input;
+    bool xxh64 = false;
+    bool sha1 = false;
+    bool sha256 = false;
+    bool uuid = false;
+
+    bool no_layer = false;
+
+    bool manifest_is_usable = false;
+    std::string manifest_file;
+};
+
+struct manifest_check_params {
+    bool xxh64 = false;
+    bool sha1 = false;
+    bool sha256 = false;
+    bool uuid = false;
+};
+
+static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
+    switch (value) {
+        case HASH_MANIFEST_NOT_FOUND: return "Not Found";
+        case HASH_MANIFEST_MISMATCH: return "Mismatch";
+        case HASH_MANIFEST_OK: return "Ok";
+    }
+    return "?";
+}
+
+static char const * hash_exit_code_to_str(hash_exit_code_t value) {
+    switch (value) {
+        case HASH_EXIT_SUCCESS: return "Success";
+        case HASH_EXIT_FAILURE: return "Failure";
+        case HASH_EXIT_MISMATCH: return "Mismatch";
+        case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
+        case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
+        case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
+    }
+    return "?";
+}
+
+static void hash_print_usage(const char * executable) {
+    const hash_params default_params;
+    printf("\n");
+    printf("usage: %s [options] GGUF_IN\n", executable);
+    printf("\n");
+    printf("Hash a GGUF file");
+    printf("\n");
+    printf("options:\n");
+    printf("  -h, --help              show this help message and exit\n");
+    printf("      --xxh64             use xxh64 hash\n");
+    printf("      --sha1              use sha1 hash\n");
+    printf("      --sha256            use sha256 hash\n");
+    printf("      --all               use all hash\n");
+    printf("      --no-layer          exclude per layer hash\n");
+    printf("      --uuid              generate UUIDv5 ID\n");
+    printf("  -c, --check <manifest>  verify against a manifest\n");
+    printf("\n");
+}
+
+static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
+    std::string arg;
+    bool invalid_param = false;
+    const std::string arg_prefix = "--";
+
+    int arg_idx = 1;
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        arg = argv[arg_idx];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        bool arg_found = false;
+        if (arg == "-h" || arg == "--help") {
+            hash_print_usage(argv[0]);
+            exit(0);
+        }
+
+        if (arg == "--xxh64") {
+            arg_found = true;
+            params.xxh64 = true;
+        }
+
+        if (arg == "--sha1") {
+            arg_found = true;
+            params.sha1 = true;
+        }
+
+        if (arg == "--uuid") {
+            arg_found = true;
+            params.uuid = true;
+        }
+
+        if (arg == "--sha256") {
+            arg_found = true;
+            params.sha256 = true;
+        }
+
+        if (arg == "--all") {
+            arg_found = true;
+            params.sha256 = true;
+            params.sha1 = true;
+            params.xxh64 = true;
+        }
+
+        if (arg == "--no-layer") {
+            arg_found = true;
+            params.no_layer = true;
+        }
+
+        if (arg == "-c" || arg == "--check") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            params.manifest_file = argv[arg_idx];
+        }
+
+        if (!arg_found) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument:" + arg);
+    }
+
+    if (argc - arg_idx < 1) {
+        throw std::invalid_argument("error: bad arguments");
+    }
+
+    params.input = argv[arg_idx++];
+}
+
+static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
+    bool result = true;
+    try {
+        hash_params_parse_ex(argc, argv, params);
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        hash_print_usage(argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    return result;
+}
+
+static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
+    if (manifest_file.empty()) {
+        return false;
+    }
+
+    std::ifstream file(manifest_file);
+    if (!file.is_open()) {
+        return false;
+    }
+
+    std::string manifest_entry_line;
+    while (getline(file, manifest_entry_line)) {
+        // hash_type_str hash_str tensor_name
+        // e.g. 'xxh64     f66e9cd66a4396a0  test.gguf:tensor_0'
+        std::istringstream line_stream(manifest_entry_line);
+        std::string file_hash_type;
+        if (line_stream >> file_hash_type) {
+            if (file_hash_type == HASH_TYPE_SHA256_STR) {
+                manifest_check.sha256 = true;
+            } else if (file_hash_type == HASH_TYPE_SHA1_STR) {
+                manifest_check.sha1 = true;
+            } else if (file_hash_type == HASH_TYPE_XXH64_STR) {
+                manifest_check.xxh64 = true;
+            } else if (file_hash_type == HASH_TYPE_UUID_STR) {
+                manifest_check.uuid = true;
+            }
+        }
+    }
+
+    return true;
+}
+
+static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
+    if (manifest_file.empty()) {
+        return HASH_MANIFEST_NOT_FOUND;
+    }
+
+    std::ifstream file(manifest_file);
+    if (!file.is_open()) {
+        return HASH_MANIFEST_NOT_FOUND;
+    }
+
+    std::string manifest_entry_line;
+    while (getline(file, manifest_entry_line)) {
+        std::istringstream line_stream(manifest_entry_line);
+        std::string file_hash_type;
+        std::string file_hash;
+        std::string file_tensor_name;
+        if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
+            // Line parsed. Check hash validity
+
+            if (file_hash_type != hash_type_str) {
+                continue;
+            }
+
+            if (file_tensor_name != tensor_name) {
+                continue;
+            }
+
+            return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
+        }
+    }
+
+    return HASH_MANIFEST_NOT_FOUND;
+}
+
+static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
+    // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
+    // Assumes that digest was processed correctly with the expected namespace
+    for (int i = 0; i < 16; i++) {
+        uuid[i] = sha1_digest[i];
+    }
+
+    // Set bits corresponding to UUID ver 5
+    uuid[ 6] &= ~(0xF << 4);
+    uuid[ 6] |= (5 << 4);
+
+    // Set bits corresponding to UUID variant 0b10XX
+    uuid[ 8] &= ~(0xc << 4);
+    uuid[ 8] |= (0x8 << 4);
+}
+
+static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
+    const std::string & fname = hash_params.input;
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    // xxh64 init
+    XXH64_state_t* xxh64_model_hash_state = NULL;
+    if (hash_params.xxh64) {
+        xxh64_model_hash_state = XXH64_createState();
+        if (xxh64_model_hash_state==NULL) {
+            abort();
+        }
+
+        XXH64_hash_t const seed = 0;
+        if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
+            abort();
+        }
+    }
+
+    // sha1 init
+    SHA1_CTX sha1_model_hash_ctx;
+    if (hash_params.sha1) {
+        SHA1Init(&sha1_model_hash_ctx);
+    }
+
+    // sha256 init
+    sha256_t sha256_model_hash_ctx;
+    if (hash_params.sha256) {
+        sha256_init(&sha256_model_hash_ctx);
+    }
+
+    // sha1 for uuid init
+    SHA1_CTX sha1_for_uuid_ctx;
+    if (hash_params.uuid) {
+        unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
+        SHA1Init(&sha1_for_uuid_ctx);
+        SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
+    }
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    bool tensor_layer_in_manifest = false;
+    bool model_in_manifest = false;
+    bool tensor_layer_has_mismatch = false;
+    bool model_has_mismatch = false;
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx, i);
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        auto n_bytes = ggml_nbytes(cur);
+        auto *raw_data = cur->data;
+        const std::string tensor_layer_name = fname + ":" + name;
+
+        if (hash_params.xxh64) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
+
+                char hex_result[17];
+                for (int  offset = 0; offset < 8; offset++) {
+                    unsigned int shift_bits_by = (8 * (8 - offset - 1));
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+        }
+
+        if (hash_params.sha1) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                char result[21]; // sha1 outputs 20 bytes
+                SHA1( result, (const char *)raw_data, n_bytes);
+
+                char hex_result[41] = {0};
+                for (int  offset = 0; offset < 20; offset++) {
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+
+        if (hash_params.sha256) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
+                sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
+
+                char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
+                for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+
+        if (hash_params.uuid) {
+            SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+    }
+
+    if (hash_params.xxh64) {
+        XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
+
+        char hex_result[17];
+        for (int  offset = 0; offset < 8; offset++) {
+            unsigned int shift_bits_by = (8 * (8 - offset - 1));
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.sha1) {
+        unsigned char result[21];
+        SHA1Final(result, &sha1_model_hash_ctx);
+
+        char hex_result[41];
+        for (int  offset = 0; offset < 20; offset++) {
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.sha256) {
+        unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
+        sha256_final( &sha256_model_hash_ctx,  result);
+
+        char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
+        for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.uuid) {
+        unsigned char result[21];
+        SHA1Final(result, &sha1_for_uuid_ctx);
+
+        unsigned char uuid[16];
+        generate_uuidv5(result, uuid);
+
+        char string_buffer[37] = {0};
+        snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+            uuid[0], uuid[1], uuid[2], uuid[3],
+            uuid[4], uuid[5], uuid[6], uuid[7],
+            uuid[8], uuid[9], uuid[10], uuid[11],
+            uuid[12], uuid[13], uuid[14], uuid[15]);
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
+        }
+    }
+
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+
+    if (hash_params.manifest_is_usable) {
+        // In hash verification mode
+
+        if (!model_in_manifest) {
+            // model missing in manifest?
+
+            // Check tensor layer...
+            if (!tensor_layer_in_manifest) {
+                // Still missing? Maybe we are reading the wrong manifest.
+                return HASH_EXIT_MANIFEST_MISSING_ENTRY;
+            }
+
+            if (tensor_layer_has_mismatch) {
+                // Per tensor check found error
+                return HASH_EXIT_FAILURE;
+            }
+
+            // All per tensor layer checks passed? Sounds good enough.
+            return HASH_EXIT_SUCCESS;
+        }
+
+        // Overall model check passed, but let's check per layer just in case
+        // If missing, we don't care too much as the overall model checked
+        if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
+            return HASH_EXIT_FAILURE;
+        }
+
+        if (model_has_mismatch) {
+            // model has failed hash somewhere in the model
+            return HASH_EXIT_FAILURE;
+        }
+
+        // All checks appears to be fine
+        return HASH_EXIT_SUCCESS;
+    }
+
+    // In hash generation mode
+    return HASH_EXIT_SUCCESS;
+}
+
+int main(int argc, const char ** argv) {
+    hash_params params;
+    manifest_check_params manifest_check;
+    hash_params_parse(argc, argv, params);
+
+    if (!params.manifest_file.empty()) {
+        if (!manifest_type(params.manifest_file, manifest_check)) {
+            printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
+            return HASH_EXIT_MANIFEST_FILE_ERROR;
+        }
+
+        if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
+            printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
+            return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
+        }
+
+        printf("manifest  %s", params.manifest_file.c_str());
+
+        if (manifest_check.sha256) {
+            printf("  sha256");
+        }
+
+        if (manifest_check.sha1) {
+            printf("  sha1");
+        }
+
+        if (manifest_check.xxh64) {
+            printf("  xxh64");
+        }
+
+        if (manifest_check.uuid) {
+            printf("  uuid");
+        }
+
+        printf("\n");
+
+        // Autoselect the highest security hash if manifest is provided but
+        // the user has not specifically defined the hash they care about
+        if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+            // User has not selected a specific value, pick most secure hash
+            if (manifest_check.sha256) {
+                params.sha256 = true;
+            } else if (manifest_check.sha1) {
+                params.sha1 = true;
+            } else if (manifest_check.xxh64) {
+                params.xxh64 = true;
+            } else if (manifest_check.uuid) {
+                params.uuid = true;
+            }
+        }
+
+        params.manifest_is_usable = true;
+    }
+
+    // By default if no swich argument provided, assume xxh64
+    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+        params.xxh64 = true;
+    }
+
+    hash_exit_code_t exit_code = gguf_hash(params);
+
+    if (params.manifest_is_usable) {
+        printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
+    }
+
+    return exit_code;
+}
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {

    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
    printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@ -1,6 +1,6 @@
 # llama.cpp/examples/imatrix

-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861

 ## Usage
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
        }
        if (m_params.verbosity > 1) {
            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
        }
        ++e.ncall;
        if (m_params.verbosity > 1) {
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 ## Input Prompts

--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -204,25 +204,23 @@ int main(int argc, char ** argv) {
    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
    std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-    const int space_token = 29871;
-    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-        inp_sfx.erase(inp_sfx.begin());
-    }
+
+    GGML_ASSERT(llama_token_prefix(model) >= 0);
+    GGML_ASSERT(llama_token_suffix(model) >= 0);
+
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+    if (add_bos) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    const llama_token middle_token = llama_token_middle(model);
    if (middle_token >= 0) {
@ -514,26 +512,21 @@ int main(int argc, char ** argv) {
                    string_process_escapes(params.input_prefix);
                    string_process_escapes(params.input_suffix);
                }
-                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                    params.input_suffix.erase(0, 1);
-                    suff_rm_leading_spc = false;
-                }
+
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-                    inp_sfx.erase(inp_sfx.begin());
-                }
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-                }
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = inp_pfx;
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());

-                const llama_token middle_token = llama_token_middle(model);
+                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+                if (add_bos) {
+                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                }
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
                if (middle_token >= 0) {
                    embd_inp.push_back(middle_token);
                }
@ -657,4 +650,3 @@ int main(int argc, char ** argv) {

    return 0;
 }
-
--- a/examples/json_schema_pydantic_example.py
+++ b/examples/json_schema_pydantic_example.py
@ -1,9 +1,9 @@
 # Usage:
 #! ./llama-server -m some-model.gguf &
 #! pip install pydantic
-#! python json-schema-pydantic-example.py
+#! python json_schema_pydantic_example.py

-from pydantic import BaseModel, Extra, TypeAdapter
+from pydantic import BaseModel, Field, TypeAdapter
 from annotated_types import MinLen
 from typing import Annotated, List, Optional
 import json, requests
@ -17,6 +17,9 @@ if True:

        The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
        '''
+        response_format = None
+        type_adapter = None
+
        if response_model:
            type_adapter = TypeAdapter(response_model)
            schema = type_adapter.json_schema()
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import itertools
 import json
@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
    raise RuntimeError("At least one of min_value or max_value must be set")

 class BuiltinRule:
-    def __init__(self, content: str, deps: list = None):
+    def __init__(self, content: str, deps: list | None = None):
        self.content = content
        self.deps = deps or []

@ -231,7 +233,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')


 class SchemaConverter:
@ -248,7 +250,7 @@ class SchemaConverter:

    def _format_literal(self, literal):
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
        )
        return f'"{escaped}"'

@ -403,11 +405,11 @@ class SchemaConverter:
        i = 0
        length = len(pattern)

-        def to_rule(s: Tuple[str, bool]) -> str:
+        def to_rule(s: tuple[str, bool]) -> str:
            (txt, is_literal) = s
            return "\"" + txt + "\"" if is_literal else txt

-        def transform() -> Tuple[str, bool]:
+        def transform() -> tuple[str, bool]:
            '''
                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
            '''
@ -420,7 +422,7 @@ class SchemaConverter:
            # We only need a flat structure here to apply repetition operators to the last item, and
            # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
            # (GBNF's syntax is luckily very close to regular expressions!)
-            seq: list[Tuple[str, bool]] = []
+            seq: list[tuple[str, bool]] = []

            def get_dot():
                if self._dotall:
@ -602,7 +604,7 @@ class SchemaConverter:
                else:
                    add_component(t, is_required=True)

-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))

        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
            items = schema.get('items') or schema['prefixItems']
@ -691,7 +693,7 @@ class SchemaConverter:
        required_props = [k for k in sorted_props if k in required]
        optional_props = [k for k in sorted_props if k not in required]

-        if additional_properties != False:
+        if additional_properties is not None and additional_properties != False:
            sub_name = f'{name}{"-" if name else ""}additional'
            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
                self._add_primitive('value', PRIMITIVE_RULES['value'])
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -23,6 +23,10 @@
 #include "ggml-cuda.h"
 #include "ggml-sycl.h"

+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 // utils
 static uint64_t get_time_ns() {
    using clock = std::chrono::high_resolution_clock;
@ -120,6 +124,17 @@ static std::string get_gpu_info() {
            id += "/";
        }
    }
+#endif
+#ifdef GGML_USE_CANN
+    uint32_t count = ggml_backend_cann_get_device_count();
+    for (uint32_t i = 0; i < count; i++) {
+        char buf[128];
+        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
    // TODO: other backends
    return id;
@ -135,7 +150,7 @@ static const char * output_format_str(output_formats format) {
        case JSON:     return "json";
        case MARKDOWN: return "md";
        case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
    }
 }

@ -161,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
        case LLAMA_SPLIT_MODE_NONE:  return "none";
        case LLAMA_SPLIT_MODE_LAYER: return "layer";
        case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
    }
 }

@ -1311,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
        case SQL:
            return std::unique_ptr<printer>(new sql_printer());
    }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }

 int main(int argc, char ** argv) {
--- a/examples/llama.android/llama/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")

-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)

 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)

 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
    }

    auto new_token_chars = llama_token_to_piece(context, new_token_id);
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -26,11 +26,12 @@ actor LlamaContext {
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
+    var is_done: Bool = false

    /// This variable is used to store temporarily invalid cchars
    private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
    var n_cur: Int32 = 0

    var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
+            is_done = true
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
            return new_token_str
@ -322,7 +324,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -330,7 +332,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
        messageLog += "\(text)"

        Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                let result = await llamaContext.completion_loop()
                await MainActor.run {
                    self.messageLog += "\(result)"
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```

-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \
 ```

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
    --projector-type ldpv2
 ```

-4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 pip install -r examples/llava/requirements.txt
 ```

-3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
 ```

-4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 pip install -r examples/llava/requirements.txt
 ```

-3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
+python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory

@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

 5) Create the visual gguf model:
 ```console
-python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

 6) Then convert the model to gguf format:
 ```console
-python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava cli using the 1.6 model version:
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -16,6 +16,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@ -865,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            embeddings = peg_0;
        }
        else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
        }
    }

@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif

+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+#endif
+

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@ -185,6 +185,8 @@ else:
    fout.add_description("two-tower CLIP model")

 if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@ -259,8 +261,8 @@ if has_vision_encoder:


    if processor is not None:
-        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)


 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
@ -286,7 +288,7 @@ if has_llava_projector:

    print("Projector tensors added\n")

-state_dict = model.state_dict()
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
 for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
--- a/examples/llava/llava_surgery.py
+++ b/examples/llava/llava_surgery.py
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
@ -2,7 +2,9 @@ import argparse
 import glob
 import os
 import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast

 # Function to determine if file is a SafeTensor file
 def is_safetensor_file(file_path):
@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
 def load_model(file_path):
    if is_safetensor_file(file_path):
        tensors = {}
-        with safe_open(file_path, framework="pt", device="cpu") as f:
+        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key).clone()
                # output shape
@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
    if last_checkpoint is not None:
        for k, v in last_checkpoint.items():
            print(k)
-    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
+    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
    print("No tensors found. Is this a LLaVA model?")
    exit()

@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 # projector = {name: checkpoint.[name].float() for name in mm_tensors}
 projector = {}
 for name in mm_tensors:
+    assert last_checkpoint is not None
    projector[name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
+    assert first_checkpoint is not None
    projector[name] = first_checkpoint[name].float()

 if len(projector) > 0:
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert-legacy-llama.txt
+-r ../../requirements/requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch~=2.1.1
+torch~=2.2.1
--- a/Show More
+++ b/Show More