Merge branch 'ggerganov:master' into master

Author: hackingthekernel
Date:   2024-07-12 14:34:19 +01:00 (committed by GitHub)
Commit: 3277bb88e5
225 changed files with 20673 additions and 4531 deletions


@@ -17,19 +17,18 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
- clblast,
+ curl,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
-   useOpenCL
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
- useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
  useMpi ? false, # Increases the runtime closure size by ~700M
- useOpenCL ? false,
  useRocm ? config.rocmSupport,
+ enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -56,7 +55,6 @@ let
      ++ lib.optionals useCuda [ "CUDA" ]
      ++ lib.optionals useMetalKit [ "MetalKit" ]
      ++ lib.optionals useMpi [ "MPI" ]
-     ++ lib.optionals useOpenCL [ "OpenCL" ]
      ++ lib.optionals useRocm [ "ROCm" ]
      ++ lib.optionals useVulkan [ "Vulkan" ];
@@ -91,6 +89,22 @@ let
            ps.tiktoken
            ps.torchWithoutCuda
            ps.transformers
+
+           # server bench
+           ps.matplotlib
+
+           # server tests
+           ps.openai
+           ps.behave
+           ps.prometheus-client
+
+           # for examples/pydantic-models-to-grammar-examples.py
+           ps.docstring-parser
+           ps.pydantic
+
+           # for scripts/compare-llama-bench.py
+           ps.gitpython
+           ps.tabulate
          ]
        );
@@ -198,19 +212,19 @@ effectiveStdenv.mkDerivation (
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
-     ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
-     ++ optionals useVulkan vulkanBuildInputs;
+     ++ optionals useVulkan vulkanBuildInputs
+     ++ optionals enableCurl [ curl ];

      cmakeFlags =
        [
          (cmakeBool "LLAMA_BUILD_SERVER" true)
          (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
          (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+         (cmakeBool "LLAMA_CURL" enableCurl)
          (cmakeBool "GGML_NATIVE" false)
          (cmakeBool "GGML_BLAS" useBlas)
-         (cmakeBool "GGML_CLBLAST" useOpenCL)
          (cmakeBool "GGML_CUDA" useCuda)
          (cmakeBool "GGML_HIPBLAS" useRocm)
          (cmakeBool "GGML_METAL" useMetalKit)
@@ -254,7 +268,6 @@ effectiveStdenv.mkDerivation (
          useCuda
          useMetalKit
          useMpi
-         useOpenCL
          useRocm
          useVulkan
          ;
@@ -281,7 +294,7 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-     badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+     badPlatforms = optionals useCuda lib.platforms.darwin;
      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
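A quick, hedged sketch of how these reworked options are typically exercised from the flake; the `packages.<system>.default` output name and the availability of `.override` follow common nixpkgs/callPackage conventions and are assumptions, not something this diff guarantees:

```bash
# Build the default package from the repository flake
nix build .#default

# Toggle the arguments shown above (e.g. drop the new curl dependency, enable Vulkan).
# `.override` is available when the derivation is instantiated via callPackage,
# which is the usual nixpkgs pattern.
nix build --impure --expr '
  (builtins.getFlake (toString ./.)).packages.${builtins.currentSystem}.default.override {
    enableCurl = false;
    useVulkan  = true;
  }'
```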


@@ -8,7 +8,7 @@ arg1="$1"
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert-hf-to-gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
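For context, this entrypoint script is what the "full" Docker image dispatches to. A hedged usage sketch follows; the image tag and the mounted paths are illustrative assumptions, not taken from this diff:

```bash
# Convert a Hugging Face model with the container's bundled convert_hf_to_gguf.py
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full \
    --convert /models/my-hf-model --outtype f16

# Quantize the result, then run inference with it
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full \
    --quantize /models/my-hf-model/ggml-model-f16.gguf /models/my-hf-model/ggml-model-q4_0.gguf q4_0
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full \
    --run -m /models/my-hf-model/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 256
```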


@@ -9,5 +9,3 @@ contact_links:
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with

.github/labeler.yml

@@ -16,7 +16,9 @@ SYCL:
    - any-glob-to-any-file:
        - ggml/include/ggml-sycl.h
        - ggml/src/ggml-sycl.cpp
-       - README-sycl.md
+       - ggml/src/ggml-sycl/**
+       - docs/backend/SYCL.md
+       - examples/sycl/**
Nvidia GPU:
  - changed-files:
    - any-glob-to-any-file:

.github/workflows/python-type-check.yml (new file)

@@ -0,0 +1,38 @@
name: Python Type-Check
on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- '**.py'
- '**/requirements*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
steps:
- name: Check out source repository
uses: actions/checkout@v4
- name: Set up Python environment
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python dependencies
# TODO: use a venv
run: pip install -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.370
level: warning
warnings: true
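To reproduce this check locally before pushing, something along these lines should work. This is a sketch mirroring the workflow's steps; the `pyright` PyPI wrapper package and the virtualenv layout are assumptions about your local setup:

```bash
# Mirror the CI environment: install the project's Python dependencies,
# then run the same pyright version the workflow pins.
python3 -m venv .venv && source .venv/bin/activate
pip install -r requirements/requirements-all.txt
pip install pyright==1.1.370
pyright   # picks up the repository's pyright configuration, if present
```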

.gitignore

@@ -47,6 +47,7 @@ build*
!build-info.cpp.in
!build-info.sh
!build.zig
+!docs/build.md
/libllama.so
/llama-*
android-ndk-*
@@ -60,6 +61,11 @@ llama-batched-swift
out/
tmp/

+# Deprecated
+/main
+/server
+
# CI
!.github/workflows/*.yml
@@ -98,13 +104,14 @@ examples/server/*.mjs.hpp
# Python
-__pycache__
-.venv
-/Pipfile
-dist
-poetry.lock
+/.venv
+__pycache__/
+*/poetry.lock
poetry.toml

+# Nix
+/result
+
# Test binaries
/tests/test-backend-ops
/tests/test-double-float


@@ -42,13 +42,14 @@ endif()
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
#
# option list
#

-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
# debug
option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -73,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

# override ggml options
-set(GGML_CCACHE             ${LLAMA_CCACHE})
set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -111,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
# build the library
#

-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
add_subdirectory(src)

#
@@ -152,7 +155,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

install(
-    FILES convert-hf-to-gguf.py
+    FILES convert_hf_to_gguf.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
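The `LLAMA_CURL` option visible in the hunk context above (OFF by default) can be flipped on at configure time; a minimal sketch:

```bash
# Configure with libcurl-based model downloading enabled, then build
cmake -B build -DLLAMA_CURL=ON
cmake --build build --config Release -j
```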


@@ -19,6 +19,7 @@
      "cacheVariables": {
        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
        "CMAKE_CXX_COMPILER": "icx",
+       "CMAKE_C_COMPILER": "cl",
        "GGML_SYCL": "ON",
        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
      }


@@ -1,14 +1,24 @@
-# Contributing Guidelines
-
-## Checklist
-
-* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
-* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
-* Execute [the full CI locally on your machine](ci/README.md) before publishing
-
-## PR formatting
-
-* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
-* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
-* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
+# Pull requests
+
+- Always squash-merge the PR before merging
+- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Test your changes:
+  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+  - Execute [the full CI locally on your machine](ci/README.md) before publishing
+- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
+- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+
+# Coding guidelines
+
+- Avoid adding third-party dependencies, extra files, extra headers, etc.
+- Always consider cross-compatibility with other operating systems and architectures
+- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
+- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
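The two testing steps called out in the guidelines can be run locally; a hedged sketch (the directories passed to `ci/run.sh` follow the conventions described in ci/README.md):

```bash
# Backend-level tests mentioned above (after a normal build)
./tests/test-backend-ops

# Full CI run as described in ci/README.md; results and downloaded models land in the given directories
mkdir -p tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```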

Makefile

@@ -14,6 +14,7 @@ BUILD_TARGETS = \
    llama-finetune \
    llama-gbnf-validator \
    llama-gguf \
+   llama-gguf-hash \
    llama-gguf-split \
    llama-gritlm \
    llama-imatrix \
@@ -62,6 +63,15 @@ TEST_TARGETS = \
    tests/test-tokenizer-1-bpe \
    tests/test-tokenizer-1-spm

+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+   simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+   retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+
+# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
# Deprecation aliases
ifdef LLAMA_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -187,7 +197,7 @@ ifdef GGML_RPC
    BUILD_TARGETS += rpc-server
endif

-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)

test: $(TEST_TARGETS)
    @failures=0; \
@@ -222,7 +232,7 @@ test: $(TEST_TARGETS)
    fi
    @echo 'All tests passed.'

-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)

ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
@@ -239,17 +249,22 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11

-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
CCACHE := $(shell which ccache)
ifdef CCACHE
export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
CC    := $(CCACHE) $(CC)
CXX   := $(CCACHE) $(CXX)
else
$(info I ccache not found. Consider installing it for faster compilation.)
endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE

# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
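In practice the rename above means ccache is now controlled by a GGML_-prefixed switch; a small sketch of the new spelling and the still-recognized legacy one:

```bash
# New spelling: disable ccache for this build
make -j GGML_NO_CCACHE=1

# Legacy spelling still works, but (per the block above) it now just sets
# GGML_NO_CCACHE and flips DEPRECATE_WARNING
make -j LLAMA_NO_CCACHE=1
```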
@@ -532,14 +547,20 @@ ifdef GGML_OPENBLAS64
endif # GGML_OPENBLAS64

ifdef GGML_BLIS
-   MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
+   MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
    MK_LDFLAGS  += -lblis -L/usr/local/lib
    OBJ_GGML    += ggml/src/ggml-blas.o
endif # GGML_BLIS

+ifdef GGML_NVPL
+   MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+   MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+   OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_NVPL
+
ifndef GGML_NO_LLAMAFILE
    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-   OBJ_GGML    += ggml/src/sgemm.o
+   OBJ_GGML    += ggml/src/llamafile/sgemm.o
endif

ifdef GGML_RPC
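The new `GGML_NVPL` switch is used like the other BLAS toggles; a hedged sketch (assumes an aarch64 system with the NVIDIA Performance Libraries installed in the default include/lib paths shown above):

```bash
# BLIS-backed build (existing option, now also defining GGML_BLAS_USE_BLIS)
make -j GGML_BLIS=1

# NVPL-backed build (new option added above)
make -j GGML_NVPL=1
```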
@@ -820,7 +841,8 @@ OBJ_GGML += \
    ggml/src/ggml.o \
    ggml/src/ggml-alloc.o \
    ggml/src/ggml-backend.o \
-   ggml/src/ggml-quants.o
+   ggml/src/ggml-quants.o \
+   ggml/src/ggml-aarch64.o

OBJ_LLAMA = \
    src/llama.o \
@@ -920,6 +942,7 @@ $(info - LLAMA_NO_LLAMAFILE)
$(info - LLAMA_NO_ACCELERATE)
$(info - LLAMA_NO_OPENMP)
$(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
$(info )
endif
@@ -953,15 +976,22 @@ ggml/src/ggml-quants.o: \
    ggml/src/ggml-common.h
    $(CC) $(CFLAGS) -c $< -o $@

+ggml/src/ggml-aarch64.o: \
+   ggml/src/ggml-aarch64.c \
+   ggml/include/ggml.h \
+   ggml/src/ggml-aarch64.h \
+   ggml/src/ggml-common.h
+   $(CC) $(CFLAGS) -c $< -o $@
+
ggml/src/ggml-blas.o: \
    ggml/src/ggml-blas.cpp \
    ggml/include/ggml-blas.h
    $(CXX) $(CXXFLAGS) -c $< -o $@

ifndef GGML_NO_LLAMAFILE
-ggml/src/sgemm.o: \
-   ggml/src/sgemm.cpp \
-   ggml/src/sgemm.h \
+ggml/src/llamafile/sgemm.o: \
+   ggml/src/llamafile/sgemm.cpp \
+   ggml/src/llamafile/sgemm.h \
    ggml/include/ggml.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
@@ -1086,6 +1116,7 @@ clean:
    rm -vrf ggml/src/ggml-cuda/template-instances/*.o
    rm -rvf $(BUILD_TARGETS)
    rm -rvf $(TEST_TARGETS)
+   rm -rvf $(LEGACY_TARGETS_CLEAN)
    find examples pocs -type f -name "*.o" -delete

#
@@ -1172,6 +1203,23 @@ llama-gguf: examples/gguf/gguf.cpp \
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+examples/gguf-hash/deps/sha1/sha1.o: \
+   examples/gguf-hash/deps/sha1/sha1.c
+   $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/xxhash/xxhash.o: \
+   examples/gguf-hash/deps/xxhash/xxhash.c
+   $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/sha256/sha256.o: \
+   examples/gguf-hash/deps/sha256/sha256.c
+   $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
+   $(OBJ_ALL)
+   $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
+   $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
    $(OBJ_ALL)
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
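Once built, the new tool can be used to checksum a GGUF file. A hedged sketch follows; the hash-selection flag shown is an assumption based on the bundled sha1/sha256/xxhash dependencies, so check `--help` for the real set:

```bash
make -j llama-gguf-hash

./llama-gguf-hash --help                                          # list the supported hash options
./llama-gguf-hash --xxh64 models/mymodel/ggml-model-Q4_K_M.gguf   # flag name assumed, not taken from this diff
```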
@@ -1464,3 +1512,61 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
    $(OBJ_GGML)
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
#
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server finetune
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these targets from building all the time.
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif
perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif
embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif
finetune: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard finetune))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
@echo " Remove the 'finetune' binary to remove this warning."
@echo "#########"
endif
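A brief sketch of what these stub targets do for users who still call the old names (the output wording is illustrative, not an exact transcript):

```bash
make -j main server        # builds the deprecation stubs listed in LEGACY_TARGETS_BUILD

./main                     # prints a notice that 'main' was renamed and points to ./llama-cli
./server                   # likewise points to ./llama-server

./llama-cli -m models/mymodel/ggml-model-Q4_K_M.gguf -p "Hello"   # the renamed binary to use instead
```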


@@ -10,6 +10,7 @@ var sources = [
    "ggml/src/ggml-alloc.c",
    "ggml/src/ggml-backend.c",
    "ggml/src/ggml-quants.c",
+   "ggml/src/ggml-aarch64.c",
]

var resources: [Resource] = []

README.md

@@ -13,7 +13,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

-### Recent API changes
+## Recent API changes

- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
@@ -24,9 +24,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

-### Hot topics
+## Hot topics

-- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
@@ -39,37 +39,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

----

-<details>
-  <summary>Table of Contents</summary>
-  <ol>
-    <li>
-      <a href="#description">Description</a>
-    </li>
-    <li>
-      <a href="#usage">Usage</a>
-      <ul>
-        <li><a href="#get-the-code">Get the Code</a></li>
-        <li><a href="#build">Build</a></li>
-        <li><a href="#blas-build">BLAS Build</a></li>
-        <li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
-        <li><a href="#run-the-quantized-model">Run the quantized model</a></li>
-        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
-        <li><a href="#quantization">Quantization</a></li>
-        <li><a href="#interactive-mode">Interactive mode</a></li>
-        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-        <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
-        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
-        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
-        <li><a href="#android">Android</a></li>
-        <li><a href="#docker">Docker</a></li>
-      </ul>
-    </li>
-    <li><a href="#contributing">Contributing</a></li>
-    <li><a href="#coding-guidelines">Coding guidelines</a></li>
-    <li><a href="#docs">Docs</a></li>
-  </ol>
-</details>
-
## Description

The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -87,14 +56,6 @@ Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomm
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.

-**Supported platforms:**
-
-- [X] Mac OS
-- [X] Linux
-- [X] Windows (via CMake)
-- [X] Docker
-- [X] FreeBSD
-
**Supported models:**

Typically finetunes of the base models below are supported as well.
@@ -108,6 +69,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -134,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)

-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

**Multimodal models:**
@@ -149,12 +112,6 @@ Typically finetunes of the base models below are supported as well.
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

-**HTTP server**
-
-[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
-
**Bindings:**

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@@ -175,6 +132,7 @@ Typically finetunes of the base models below are supported as well.
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
**UI:**

@@ -217,10 +175,16 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**

- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
+- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption

----
-
-Here is a typical run using LLaMA v2 13B on M2 Ultra:
+**Infrastructure:**
+
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+
+## Demo
+
+<details>
+<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>

```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
@@ -300,454 +264,85 @@ llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms
llama_print_timings: total time = 25431.49 ms
```

+</details>
+
+<details>
+<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>

And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:

https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4

+</details>
## Usage

Here are the end-to-end binary build and model conversion steps for most supported models.

-### Get the Code
+### Basic usage
+
+Firstly, you need to get the binary. There are different methods that you can follow:
+- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
+- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
+- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
+
+You can run a basic completion using this command:

```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
+llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+
+# Output:
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```

-### Build
-
-In order to build llama.cpp you have four different options.
-
-- Using `make`:
+See [this page](./examples/main/README.md) for a full list of parameters.
+
+### Conversation mode
+
+If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
- On Linux or MacOS:
```bash
make
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Nix
On Mac and Linux, the Nix package manager can be used via
```
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
```
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
#### Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
- #### OpenBLAS:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
- #### BLIS
Check [BLIS.md](docs/BLIS.md) for more information.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
- #### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
- Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake --build build --config Release
```
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
- #### CUDA
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson users, if you have a Jetson Orin you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older board (Nano/TX2), some additional operations are needed before compiling.
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### Vulkan
**With docker**:
You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
**Without docker**:
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
For example, on Ubuntu 22.04 (jammy), use the command below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
-# obtain the official LLaMA model weights and place them in ./models
-ls ./models
-llama-2-7b tokenizer_checklist.chk tokenizer.model
-
-# [Optional] for models using BPE tokenizers
-ls ./models
-<folder containing weights and tokenizer json> vocab.json
-
-# [Optional] for PyTorch .bin models like Mistral-7B
-ls ./models
-<folder containing weights and tokenizer json>
-
-# install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# convert the model to ggml FP16 format
-python3 convert-hf-to-gguf.py models/mymodel/
-
-# quantize the model to 4-bits (using Q4_K_M method)
-./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
-
-# update the gguf filetype to current version if older version is now unsupported
-./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+
+# Output:
+# > hi, who are you?
+# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+#
+# > what is 1+1?
+# Easy peasy! The answer to 1+1 is... 2!
```

-### Run the quantized model
+By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

```bash
-# start inference on a gguf model
-./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
```

-When running the larger models, make sure you have enough disk space to store all the intermediate files.
+You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:

-### Running on Windows with prebuilt binaries
-
-You will find prebuilt Windows binaries on the release page.
-
-Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
-
-From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
-
-```
-.\main -m llama-2-7b.Q4_0.gguf -n 128
-```
+```bash
+./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+```

-### Memory/Disk Requirements
+### Web server

-As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

-| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|----------------------:|
-|    7B |         13 GB |                3.9 GB |
-|   13B |         24 GB |                7.8 GB |
-|   30B |         60 GB |               19.5 GB |
-|   65B |        120 GB |               38.5 GB |
+Example usage:

-### Quantization
-
-Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+```bash
+./llama-server -m your_model.gguf --port 8080
+
+# Basic web UI can be accessed via browser: http://localhost:8080
+# Chat completion endpoint: http://localhost:8080/v1/chat/completions
+```
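Since the server block above exposes an OpenAI-compatible endpoint, a request can be issued with plain curl; a minimal sketch (the port mirrors the example above, the message payload is illustrative):

```bash
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": "Write a haiku about GGUF."}
          ]
        }'
```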
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements and new i-quants
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
The perplexity measurements in the table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
The time per token is measured on a MacBook M1 Pro with 32GB of RAM, using 4 and 8 threads.
#### How to run
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
``` ```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.
### Interactive mode ### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. > [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command Here is an example of a few-shot interaction, invoked with the command
@ -798,18 +393,70 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Obtaining and using the Facebook LLaMA 2 model ## Build
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. Please refer to [Build llama.cpp locally](./docs/build.md)
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
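For the `## Build` section above, a minimal CPU-only sketch (assuming CMake is installed; backend-specific flags such as `-DGGML_CUDA=1` are covered in [docs/build.md](./docs/build.md)):

```bash
# clone and build with CMake; the binaries end up in build/bin
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release -j
```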
### Seminal papers and background on the models ## Supported backends
| Backend | Target devices |
| --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
## Tools
### Prepare and Quantize
> [!NOTE]
> You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights, please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
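If you want to try one of those pre-quantized models directly, here is a minimal sketch (the repository and file names are placeholders, and downloading this way requires a build with `LLAMA_CURL` enabled):

```bash
# download a GGUF file from a Hugging Face repository and start a conversation with it
./llama-cli -hfr <user>/<repo> -hff <model-file>.gguf -p "You are a helpful assistant" -cnv
```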
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3; for LLaMA 3 models downloaded from Hugging Face, use `convert_hf_to_gguf.py` instead.
To learn more about quantizing models, [read this documentation](./examples/quantize/README.md)
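Putting the conversion and quantization steps together, a sketch (assuming a Hugging Face model has already been downloaded to `models/mymodel/`):

```bash
# convert the Hugging Face model to a GGUF file in F16, then quantize it to Q4_K_M
python3 convert_hf_to_gguf.py models/mymodel/ --outfile models/mymodel/ggml-model-f16.gguf
./llama-quantize models/mymodel/ggml-model-f16.gguf models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
```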
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
To learn more about how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
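A sketch of a typical measurement (assuming the wikitext-2 archive below extracts to a `wikitext-2-raw/` folder; adjust the model path to your own GGUF file):

```bash
# download the wikitext-2 test set and compute perplexity over it
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
./llama-perplexity -m models/mymodel/ggml-model-Q4_K_M.gguf -f wikitext-2-raw/wiki.test.raw
```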
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [GBNF grammars](./grammars/README.md)
**Development documentations**
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA: - LLaMA:
@ -820,178 +467,3 @@ If your issue is with model generation quality, then please at least scan the fo
- GPT-3.5 / InstructGPT / ChatGPT: - GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
### Android
#### Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a terminal emulator app that lets you run `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can do this in Termux:
```
$ mkdir build-android
$ cd build-android
$ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assuming you have pushed the built executable files to `/sdcard/llama.cpp/bin` using `adb push`.)
```
$ cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$ cd /data/data/com.termux/files/home/bin
$ chmod +x ./*
```
Download the model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`:
```
$ mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$ cd /data/data/com.termux/files/home/bin
$ ./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on a Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
### Docker
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (e.g. /llama/models)
#### Images
We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, the following images are available, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
#### Usage
The easiest way to download the models, convert them to ggml, and optimize them is with the `--all-in-one` command of the full Docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
### Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU-enabled cloud, `cuBLAS` should be accessible inside the container.
#### Building Locally
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
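For example, a sketch of overriding those defaults (the CUDA version shown is illustrative, and `<arch>` stands for whatever architecture value your Dockerfile and toolchain expect):

```bash
docker build -t local/llama.cpp:light-cuda \
    --build-arg CUDA_VERSION=12.2.0 \
    --build-arg CUDA_DOCKER_ARCH=<arch> \
    -f .devops/llama-cli-cuda.Dockerfile .
```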
The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
#### Usage
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```
### Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
### Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean up any trailing whitespace, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
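For a quick shape check of that convention (a sketch derived only from the identity above): if $A \in \mathbb{R}^{m \times k}$ and $B \in \mathbb{R}^{n \times k}$ (rows of equal length $k$), then

$$C = \mathrm{ggml\_mul\_mat}(A, B) = B A^{T} \in \mathbb{R}^{n \times m}, \qquad C^{T} = A B^{T} \in \mathbb{R}^{m \times n}.$$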
### Docs
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [GBNF grammars](./grammars/README.md)

View File

@ -103,6 +103,9 @@ function gg_run_ctest_debug {
set -e set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
@ -131,6 +134,9 @@ function gg_run_ctest_release {
set -e set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
@ -287,7 +293,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -421,7 +427,7 @@ function gg_run_pythia_1_4b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -553,7 +559,7 @@ function gg_run_pythia_2_8b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -688,7 +694,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -701,6 +707,20 @@ function gg_run_embd_bge_small {
set +e set +e
} }
function gg_check_build_requirements {
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
fi
if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi
}
function gg_sum_embd_bge_small { function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"

View File

@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "common.h" #include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT: // Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
// CLI argument parsing // CLI argument parsing
// //
void gpt_params_handle_hf_token(gpt_params & params) {
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
params.hf_token = std::getenv("HF_TOKEN");
}
}
void gpt_params_handle_model_default(gpt_params & params) { void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) { if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model // short-hand to avoid specifying --hf-file -> default it to --model
@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
gpt_params_handle_model_default(params); gpt_params_handle_model_default(params);
gpt_params_handle_hf_token(params);
if (params.escape) { if (params.escape) {
string_process_escapes(params.prompt); string_process_escapes(params.prompt);
string_process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
@ -472,6 +484,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; } else { invalid_param = true; }
return true; return true;
} }
if (arg == "--attention") {
CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
else { invalid_param = true; }
return true;
}
if (arg == "--defrag-thold" || arg == "-dt") { if (arg == "--defrag-thold" || arg == "-dt") {
CHECK_ARG CHECK_ARG
params.defrag_thold = std::stof(argv[i]); params.defrag_thold = std::stof(argv[i]);
@ -644,6 +664,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.model_url = argv[i]; params.model_url = argv[i];
return true; return true;
} }
if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_token = argv[i];
return true;
}
if (arg == "-hfr" || arg == "--hf-repo") { if (arg == "-hfr" || arg == "--hf-repo") {
CHECK_ARG CHECK_ARG
params.hf_repo = argv[i]; params.hf_repo = argv[i];
@ -757,7 +785,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i]; params.cache_type_v = argv[++i];
return true; return true;
} }
if (arg == "--multiline-input") { if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true; params.multiline_input = true;
return true; return true;
} }
@ -1014,16 +1042,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
} }
if (arg == "--in-prefix-bos") { if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true; params.input_prefix_bos = true;
params.enable_chat_template = false;
return true; return true;
} }
if (arg == "--in-prefix") { if (arg == "--in-prefix") {
CHECK_ARG CHECK_ARG
params.input_prefix = argv[i]; params.input_prefix = argv[i];
params.enable_chat_template = false;
return true; return true;
} }
if (arg == "--in-suffix") { if (arg == "--in-suffix") {
CHECK_ARG CHECK_ARG
params.input_suffix = argv[i]; params.input_suffix = argv[i];
params.enable_chat_template = false;
return true; return true;
} }
if (arg == "--spm-infill") { if (arg == "--spm-infill") {
@ -1391,7 +1422,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
"in conversation mode, this will be used as system prompt\n"
"(default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
@ -1406,7 +1439,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n" "halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" }); "can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
"if suffix/prefix are not specified, default chat template will be used\n"
"(default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
@ -1450,6 +1485,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
options.push_back({ "main", " --chat-template JINJA_TEMPLATE", options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
"set custom jinja chat template (default: template taken from model's metadata)\n" "set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted:\n" "only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" }); options.push_back({ "grammar" });
@ -1460,8 +1496,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
options.push_back({ "embedding" }); options.push_back({ "embedding" });
options.push_back({ "embedding", " --pooling {none,mean,cls}", options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
"pooling type for embeddings, use model default if unspecified" }); "pooling type for embeddings, use model default if unspecified" });
options.push_back({ "embedding", " --attention {causal,non-causal}",
"attention type for embeddings, use model default if unspecified" });
options.push_back({ "context hacking" }); options.push_back({ "context hacking" });
options.push_back({ "*", " --rope-scaling {none,linear,yarn}", options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@ -1558,6 +1596,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
options.push_back({ "retrieval" }); options.push_back({ "retrieval" });
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
@ -1997,9 +2036,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
llama_model * model = nullptr; llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) { if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else { } else {
model = llama_load_model_from_file(params.model.c_str(), mparams); model = llama_load_model_from_file(params.model.c_str(), mparams);
} }
@ -2067,7 +2106,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (params.warmup) { if (params.warmup) {
LOG("warming up the model with an empty run\n"); LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), }; std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != -1) {
tmp.push_back(bos);
}
tmp.push_back(eos);
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx); llama_kv_cache_clear(lctx);
llama_synchronize(lctx); llama_synchronize(lctx);
@ -2150,6 +2206,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type; cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold; cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval; cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.cb_eval_user_data = params.cb_eval_user_data;
@ -2169,7 +2226,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
return str.rfind(prefix, 0) == 0; return str.rfind(prefix, 0) == 0;
} }
static bool llama_download_file(const std::string & url, const std::string & path) { static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl // Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@ -2184,6 +2241,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer ";
auth_header += hf_token.c_str();
struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
}
#if defined(_WIN32) #if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows. // operating system. Currently implemented under MS-Windows.
@ -2379,6 +2445,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
struct llama_model * llama_load_model_from_url( struct llama_model * llama_load_model_from_url(
const char * model_url, const char * model_url,
const char * path_model, const char * path_model,
const char * hf_token,
const struct llama_model_params & params) { const struct llama_model_params & params) {
// Basic validation of the model_url // Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) { if (!model_url || strlen(model_url) == 0) {
@ -2386,7 +2453,7 @@ struct llama_model * llama_load_model_from_url(
return NULL; return NULL;
} }
if (!llama_download_file(model_url, path_model)) { if (!llama_download_file(model_url, path_model, hf_token)) {
return NULL; return NULL;
} }
@ -2434,14 +2501,14 @@ struct llama_model * llama_load_model_from_url(
// Prepare download in parallel // Prepare download in parallel
std::vector<std::future<bool>> futures_download; std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) { for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0}; char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return llama_download_file(split_url, split_path); return llama_download_file(split_url, split_path, hf_token);
}, idx)); }, idx));
} }
@ -2460,6 +2527,7 @@ struct llama_model * llama_load_model_from_hf(
const char * repo, const char * repo,
const char * model, const char * model,
const char * path_model, const char * path_model,
const char * hf_token,
const struct llama_model_params & params) { const struct llama_model_params & params) {
// construct hugging face model url: // construct hugging face model url:
// //
@ -2475,7 +2543,7 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/"; model_url += "/resolve/main/";
model_url += model; model_url += model;
return llama_load_model_from_url(model_url.c_str(), path_model, params); return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
} }
#else #else
@ -2483,6 +2551,7 @@ struct llama_model * llama_load_model_from_hf(
struct llama_model * llama_load_model_from_url( struct llama_model * llama_load_model_from_url(
const char * /*model_url*/, const char * /*model_url*/,
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr; return nullptr;
@ -2492,6 +2561,7 @@ struct llama_model * llama_load_model_from_hf(
const char * /*repo*/, const char * /*repo*/,
const char * /*model*/, const char * /*model*/,
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr; return nullptr;
@ -2556,51 +2626,35 @@ std::vector<llama_token> llama_tokenize(
} }
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::vector<char> result(8, 0); std::string piece;
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
if (n_tokens < 0) { const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
result.resize(-n_tokens); if (n_chars < 0) {
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); piece.resize(-n_chars);
GGML_ASSERT(check == -n_tokens); int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
} else { GGML_ASSERT(check == -n_chars);
result.resize(n_tokens); }
else {
piece.resize(n_chars);
} }
return std::string(result.data(), result.size()); return piece;
} }
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) { std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
std::string piece; int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
std::string result; if (n_chars < 0) {
text.resize(-n_chars);
for (size_t i = 0; i < tokens.size(); ++i) { n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
piece = llama_token_to_piece(ctx, tokens[i]); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
// remove the leading space of the first non-BOS token
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
piece = piece.substr(1);
}
result += piece;
} }
return result; text.resize(n_chars);
}
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
result += piece;
}
// NOTE: the original tokenizer decodes bytes after collecting the pieces. // NOTE: the original tokenizer decodes bytes after collecting the pieces.
return result; return text;
} }
bool llama_should_add_bos_token(const llama_model * model) { bool llama_should_add_bos_token(const llama_model * model) {
@ -2668,12 +2722,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
const std::vector<llama_chat_msg> & past_msg, const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg, const llama_chat_msg & new_msg,
bool add_ass) { bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg); std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg); chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); // get the diff part
return formatted; ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
} }
std::string llama_chat_format_example(const struct llama_model * model, std::string llama_chat_format_example(const struct llama_model * model,

View File

@ -99,6 +99,7 @@ struct gpt_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;
@ -107,6 +108,7 @@ struct gpt_params {
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download std::string model_url = ""; // model url to download
std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file std::string hf_file = ""; // HF file
std::string prompt = ""; std::string prompt = "";
@ -200,6 +202,7 @@ struct gpt_params {
std::string public_path = ""; std::string public_path = "";
std::string chat_template = ""; std::string chat_template = "";
std::string system_prompt = ""; std::string system_prompt = "";
bool enable_chat_template = true;
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
@ -254,6 +257,7 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill bool spm_infill = false; // suffix/prefix/middle pattern for infill
}; };
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@ -309,8 +313,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// Batch utils // Batch utils
@ -348,21 +352,13 @@ std::string llama_token_to_piece(
llama_token token, llama_token token,
bool special = true); bool special = true);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
//
// detokenizes a vector of tokens into a string // detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode` // should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token // optionally renders special/control tokens
std::string llama_detokenize_spm( std::string llama_detokenize(
llama_context * ctx, llama_context * ctx,
const std::vector<llama_token> & tokens); const std::vector<llama_token> & tokens,
bool special = true);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
// Uses the value from the model metadata if possible, otherwise // Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false. // defaults to true when model type is SPM, otherwise false.
@ -458,4 +454,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info( void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx, FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc); const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View File

@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
buf << "[ "; buf << "[ ";
bool first = true; bool first = true;
for (const auto &token : tokens) for (const auto & token : tokens)
{ {
if (!first) { if (!first) {
buf << ", "; buf << ", ";

View File

@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
GGML_ASSERT(!original_logits.empty()); GGML_ASSERT(!original_logits.empty());
} }
llama_token id = 0; llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (temp < 0.0) { if (temp < 0.0) {
// greedy sampling, with probs // greedy sampling, with probs
@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
} }
if (ctx_sampling->grammar != NULL && !is_resampling) { if (ctx_sampling->grammar != NULL && !is_resampling) {
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Create an array with a single token data element for the sampled id // Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f}; llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) { if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL); GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; *original_logits = {logits, logits + n_vocab};
} }
// apply params.logit_bias map // apply params.logit_bias map
@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
} }
cur.clear(); cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
} }
llama_token_data_array cur_p = { cur.data(), cur.size(), false }; llama_token_data_array cur_p = { cur.data(), cur.size(), false };

View File

@ -13,7 +13,7 @@ import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from hashlib import sha256 from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
import math import math
import numpy as np import numpy as np
@ -265,7 +265,7 @@ class Model:
break break
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
data: np.ndarray = data # type hint data: np.ndarray # type hint
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
data_qtype: gguf.GGMLQuantizationType | None = None data_qtype: gguf.GGMLQuantizationType | None = None
@ -404,7 +404,7 @@ class Model:
return tokens, toktypes, tokpre return tokens, toktypes, tokpre
# NOTE: this function is generated by convert-hf-to-gguf-update.py # NOTE: this function is generated by convert_hf_to_gguf_update.py
# do not modify it manually! # do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920 # ref: https://github.com/ggerganov/llama.cpp/pull/6920
# Marker: Start get_vocab_base_pre # Marker: Start get_vocab_base_pre
@ -424,7 +424,7 @@ class Model:
res = None res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
# or pull the latest version of the model from Huggingface # or pull the latest version of the model from Huggingface
# don't edit the hashes manually! # don't edit the hashes manually!
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@ -487,18 +487,24 @@ class Model:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code" res = "jina-v2-code"
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B # ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking" res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
logger.warning("**************************************************************************************") logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:") logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**") logger.warning("**")
logger.warning(f"** chkhsh: {chkhsh}") logger.warning(f"** chkhsh: {chkhsh}")
@ -576,15 +582,23 @@ class Model:
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self): def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _create_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model' tokenizer_path = self.dir_model / 'tokenizer.model'
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}") raise FileNotFoundError(f"File not found: {tokenizer_path}")
@ -638,14 +652,7 @@ class Model:
scores.append(-1000.0) scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED) toktypes.append(SentencePieceTokenTypes.UNUSED)
self.gguf_writer.add_tokenizer_model("llama") return tokens, scores, toktypes
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self): def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model) vocab = gguf.LlamaHfVocab(self.dir_model)
@ -669,6 +676,51 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
default_pre = "mpt" if model_name == "gpt-neox" else "default"
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
assert field # tokenizer model
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
assert field # token list
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
if model_name == "llama-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
assert field # token scores
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
assert field # token types
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
if model_name != "llama-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
assert field # token merges
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
@Model.register("GPTNeoXForCausalLM") @Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model): class GPTNeoXModel(Model):
@ -1151,11 +1203,10 @@ class RefactModel(Model):
# TODO: how to determine special FIM tokens automatically? # TODO: how to determine special FIM tokens automatically?
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2) special_vocab._set_special_token("middle", 2)
special_vocab._set_special_token("fsep", 4) # is this correct?
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -1304,7 +1355,7 @@ class LlamaModel(Model):
def set_vocab(self): def set_vocab(self):
try: try:
self. _set_vocab_sentencepiece() self._set_vocab_sentencepiece()
except FileNotFoundError: except FileNotFoundError:
try: try:
self._set_vocab_llama_hf() self._set_vocab_llama_hf()
@ -1934,7 +1985,7 @@ class Phi3MiniModel(Model):
if len(rope_scaling_type) == 0: if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type') raise KeyError('Missing the required key rope_scaling.type')
if rope_scaling_type == 'su': if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
elif rope_scaling_type == 'yarn': elif rope_scaling_type == 'yarn':
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@ -2064,7 +2115,7 @@ class InternLM2Model(Model):
logger.error(f'Error: Missing {tokenizer_path}') logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1) sys.exit(1)
sentencepiece_model = model.ModelProto() sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
@ -2092,6 +2143,9 @@ class InternLM2Model(Model):
toktype = SentencePieceTokenTypes.UNUSED toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id): elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE toktype = SentencePieceTokenTypes.BYTE
# take care of unused raw token
if piece.startswith('[UNUSED'):
toktype = SentencePieceTokenTypes.UNKNOWN
tokens.append(text) tokens.append(text)
scores.append(score) scores.append(score)
@ -2107,6 +2161,47 @@ class InternLM2Model(Model):
scores.append(-1000.0) scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED) toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
chat_eos_token = '<|im_end|>'
chat_eos_token_id = None
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
for token_id, foken_data in added_tokens_decoder.items():
token_id = int(token_id)
token = foken_data["content"]
if token == chat_eos_token:
chat_eos_token_id = token_id
token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
tokenizer_file = self.dir_model / 'tokenizer.json'
if tokenizer_file.is_file():
with open(tokenizer_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
added_tokens = tokenizer_json.get("added_tokens", [])
for foken_data in added_tokens:
token_id = int(foken_data["id"])
token = foken_data["content"]
if token == chat_eos_token:
chat_eos_token_id = token_id
token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
@ -2116,28 +2211,16 @@ class InternLM2Model(Model):
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
old_eos = special_vocab.special_token_ids["eos"] old_eos = special_vocab.special_token_ids["eos"]
if "chat" in os.path.basename(self.dir_model.absolute()): if chat_eos_token_id is not None:
# For the chat model, we replace the eos with '<|im_end|>'. # For the chat model, we replace the eos with '<|im_end|>'.
# TODO: this is a hack, should be fixed # TODO: this is a hack, should be fixed
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) special_vocab.special_token_ids["eos"] = chat_eos_token_id
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
in chat mode so that the conversation can end normally.") " in chat mode so that the conversation can end normally.")
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _try_get_sft_eos(self, tokenizer):
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
im_end_list = tokenizer.Encode('<|im_end|>')
eos_token = None
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
if len(unused_145_list) == 1:
eos_token = unused_145_list[0]
if len(im_end_list) == 1:
eos_token = im_end_list[0]
assert eos_token
return eos_token
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv n_head = n_head_kv
@ -2156,6 +2239,10 @@ in chat mode so that the conversation can end normally.")
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"] num_heads = self.hparams["num_attention_heads"]
@ -2308,6 +2395,8 @@ class GemmaModel(Model):
special_vocab._set_special_token("eot", 107) special_vocab._set_special_token("eot", 107)
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self): def set_gguf_parameters(self):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
@ -2345,7 +2434,20 @@ class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2 model_arch = gguf.MODEL_ARCH.GEMMA2
def set_vocab(self): def set_vocab(self):
self._set_vocab_llama_hf() tokens, scores, toktypes = self._create_vocab_sentencepiece()
# hack: This is required so that we can properly use start/end-of-turn for chat template
for i in range(108):
# including <unusedX>, <start_of_turn>, <end_of_turn>
toktypes[i] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False) self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -2363,9 +2465,21 @@ class Gemma2Model(Model):
self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_key_length(hparams["head_dim"])
self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"])
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_attn_logit_softcapping(
self.hparams["attn_logit_softcapping"]
)
self.gguf_writer.add_final_logit_softcapping(
self.hparams["final_logit_softcapping"]
)
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
# sanity check
attn_scalar = self.hparams["query_pre_attn_scalar"]
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unusem del bid # unused
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight. # To prevent errors, skip loading lm_head.weight.
@ -2404,39 +2518,7 @@ class MambaModel(Model):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
else: else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present # Use the GPT-NeoX tokenizer when no tokenizer files are present
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" self._set_vocab_builtin("gpt-neox", vocab_size)
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
assert field
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
assert field
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
assert field
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
def set_gguf_parameters(self): def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model"]) d_model = self.find_hparam(["hidden_size", "d_model"])
@ -2588,6 +2670,82 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True) self.gguf_writer.add_add_eos_token(True)
@Model.register("OpenELMForCausalLM")
class OpenELMModel(Model):
model_arch = gguf.MODEL_ARCH.OPENELM
@staticmethod
def _make_divisible(v: float | int, divisor: int) -> int:
# ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
self._n_embd: int = self.hparams["model_dim"]
self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
self._num_query_heads: list[int] = self.hparams["num_query_heads"]
self._ffn_dims: list[int] = [
OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
for multiplier in ffn_multipliers
]
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
def set_gguf_parameters(self):
n_embd = self._n_embd
head_dim = self.hparams["head_dim"]
rot_pct = 1.0
assert self.block_count == len(self._num_kv_heads)
assert self.block_count == len(self._num_query_heads)
assert self.block_count == len(self._ffn_dims)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(self.hparams["max_context_length"])
self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
self.gguf_writer.add_head_count(self._num_query_heads)
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_file_type(self.ftype)
def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
if "n_layers" in keys:
return self.hparams["num_transformer_layers"]
return super().find_hparam(keys, optional)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# split ff
if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
ff_dim = self._ffn_dims[bid]
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
return
yield (self.map_tensor_name(name), data_torch)
@Model.register("ArcticForCausalLM") @Model.register("ArcticForCausalLM")
class ArcticModel(Model): class ArcticModel(Model):
model_arch = gguf.MODEL_ARCH.ARCTIC model_arch = gguf.MODEL_ARCH.ARCTIC
@ -2818,11 +2976,17 @@ class DeepseekV2Model(Model):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("T5ForConditionalGeneration")
@Model.register("T5WithLMHeadModel") @Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
@Model.register("MT5ForConditionalGeneration")
@Model.register("UMT5ForConditionalGeneration")
class T5Model(Model): class T5Model(Model):
model_arch = gguf.MODEL_ARCH.T5 model_arch = gguf.MODEL_ARCH.T5
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.shared_token_embeddings_found = False
def set_vocab(self): def set_vocab(self):
# to avoid TypeError: Descriptors cannot be created directly # to avoid TypeError: Descriptors cannot be created directly
# exception when importing sentencepiece_model_pb2 # exception when importing sentencepiece_model_pb2
@ -2830,17 +2994,29 @@ class T5Model(Model):
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model from sentencepiece import sentencepiece_model_pb2 as model
tokenizer_path = self.dir_model / 'spiece.model' tokenizer_path = self.dir_model / 'tokenizer.model'
# many older models use spiece.model tokenizer model filename
if not tokenizer_path.is_file():
tokenizer_path = self.dir_model / 'spiece.model'
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}") raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
# assure the tokenizer model file name is correct
assert tokenizer_path.name == 'tokenizer.model'
return self._set_vocab_sentencepiece()
else:
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
tokenizer = SentencePieceProcessor() tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path)) tokenizer.LoadFromFile(str(tokenizer_path))
@ -2910,7 +3086,10 @@ class T5Model(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("T5") self.gguf_writer.add_name("T5")
self.gguf_writer.add_context_length(self.hparams["n_positions"]) if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
n_ctx = 512
self.gguf_writer.add_context_length(n_ctx)
self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"]) self.gguf_writer.add_block_count(self.hparams["num_layers"])
@ -2926,16 +3105,295 @@ class T5Model(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
# Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
# "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
# To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight". # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight": # and decoder and ignore the remaining ones.
logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.") if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
return [] if not self.shared_token_embeddings_found:
name = "shared.weight"
self.shared_token_embeddings_found = True
else:
logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
return []
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
model_arch = gguf.MODEL_ARCH.JAIS
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# SwigLU activation
assert self.hparams["activation_function"] == "swiglu"
# ALiBi position embedding
assert self.hparams["position_embedding_type"] == "alibi"
# Embeddings scale
self.embeddings_scale = 1.0
# note: For some JAIS flavors, output is tied to (same as) wte in original model
self.output_is_wte = False
if 'mup_embeddings_scale' in self.hparams:
self.output_is_wte = True # Hack (?)
self.embeddings_scale = self.hparams['mup_embeddings_scale']
elif 'embeddings_scale' in self.hparams:
self.embeddings_scale = self.hparams['embeddings_scale']
else:
assert False
self.width_scale = 1.0
if 'mup_output_alpha' in self.hparams:
assert 'mup_width_scale' in self.hparams
self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
elif 'width_scale' in self.hparams:
self.width_scale = self.hparams['width_scale']
else:
assert False
self.max_alibi_bias = 8.0
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
tensors: list[tuple[str, Tensor]] = []
# we don't need these
if name.endswith((".attn.bias")):
return tensors
if name.endswith(("relative_pe.slopes")):
# Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
# Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
# but Jais's PyTorch model simply precalculates the slope values and places them
# in relative_pes.slopes
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
first_val = float(data_torch[0].item())
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
return tensors
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
data_torch = data_torch.transpose(1, 0)
new_name = self.map_tensor_name(name)
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
tensors.append((new_name, data_torch * self.embeddings_scale))
if self.output_is_wte:
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
assert not self.output_is_wte
tensors.append((new_name, data_torch * self.width_scale))
else:
tensors.append((new_name, data_torch))
return tensors
def write_tensors(self):
super().write_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(Model):
model_arch = gguf.MODEL_ARCH.CHATGLM
def set_vocab_chatglm3(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytes] = []
toktypes: list[int] = []
scores: list[float] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
assert max(tokenizer.get_vocab().values()) < vocab_size
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
for token_id in range(vocab_size):
piece = tokenizer._convert_id_to_token(token_id)
if token_id == 0:
piece = "<unk>"
elif token_id == 1:
piece = "<bos>"
elif token_id == 2:
piece = "<eos>"
text = piece.encode("utf-8")
score = 0.0
# Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
# it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
score = tokenizer.tokenizer.sp_model.get_score(token_id)
if len(piece) == 0:
text = f"[PAD{token_id}]".encode("utf-8")
if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
if piece in special_tokens:
# show special tokens in prompt
toktype = SentencePieceTokenTypes.USER_DEFINED
else:
toktype = SentencePieceTokenTypes.UNKNOWN
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
continue
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.tokenizer.sp_model.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.tokenizer.sp_model.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.tokenizer.sp_model.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.tokenizer.sp_model.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama")
# glm3 needs prefix and suffix formatted as:
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
byte_encoder = bytes_to_unicode()
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@staticmethod
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
parts = [bytes([b]) for b in token]
while True:
min_idx = None
min_rank = None
for i, pair in enumerate(zip(parts[:-1], parts[1:])):
rank = mergeable_ranks.get(pair[0] + pair[1])
if rank is not None and (min_rank is None or rank < min_rank):
min_idx = i
min_rank = rank
if min_rank is None or (max_rank is not None and min_rank >= max_rank):
break
assert min_idx is not None
parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
return parts
def set_vocab(self):
if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
self.set_vocab_chatglm3()
return
dir_model = self.dir_model
hparams = self.hparams
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["padded_vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) >= 2 and len(merged) <= 7
merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
self.gguf_writer.add_block_count(self.hparams["num_layers"])
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_rope_dimension_count(64)
self.gguf_writer.add_add_bos_token(False)
rope_freq = 10000
if "rope_ratio" in self.hparams:
rope_freq = rope_freq * self.hparams["rope_ratio"]
self.gguf_writer.add_rope_freq_base(rope_freq)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.endswith(".rotary_pos_emb.inv_freq"):
return []
name = name.removeprefix("transformer.")
return [(self.map_tensor_name(name), data_torch)]
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
@ -2985,10 +3443,6 @@ def parse_args() -> argparse.Namespace:
"--vocab-only", action="store_true", "--vocab-only", action="store_true",
help="extract only the vocab", help="extract only the vocab",
) )
parser.add_argument(
"--awq-path", type=Path, default=None,
help="Path to scale awq cache file",
)
parser.add_argument( parser.add_argument(
"--outfile", type=Path, "--outfile", type=Path,
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@ -3066,19 +3520,6 @@ def main() -> None:
dir_model = args.model dir_model = args.model
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path
if tmp_model_path.is_dir():
logger.info(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
logger.info("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
logger.info(f"Saved weighted model at {tmp_model_path}.")
if not dir_model.is_dir(): if not dir_model.is_dir():
logger.error(f'Error: {args.model} is not a directory') logger.error(f'Error: {args.model} is not a directory')
sys.exit(1) sys.exit(1)
@ -3091,7 +3532,8 @@ def main() -> None:
"auto": gguf.LlamaFileType.GUESSED, "auto": gguf.LlamaFileType.GUESSED,
} }
if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
if args.use_temp_file and is_split:
logger.error("Error: Cannot use temp file when splitting") logger.error("Error: Cannot use temp file when splitting")
sys.exit(1) sys.exit(1)
@ -3128,11 +3570,12 @@ def main() -> None:
if args.vocab_only: if args.vocab_only:
logger.info("Exporting model vocab...") logger.info("Exporting model vocab...")
model_instance.write_vocab() model_instance.write_vocab()
logger.info("Model vocab successfully exported.") logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
else: else:
logger.info("Exporting model...") logger.info("Exporting model...")
model_instance.write() model_instance.write()
logger.info("Model successfully exported.") out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}")
if __name__ == '__main__': if __name__ == '__main__':


@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and # This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py # generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
# #
# This is necessary in order to analyze the type of pre-tokenizer used by the model and # This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement # provide the necessary information to llama.cpp via the GGUF header in order to implement
@ -15,9 +15,9 @@
# - Add a new model to the "models" list # - Add a new model to the "models" list
# - Run the script with your huggingface token: # - Run the script with your huggingface token:
# #
# python3 convert-hf-to-gguf-update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
@ -37,7 +37,7 @@ from enum import IntEnum, auto
from transformers import AutoTokenizer from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update") logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session() sess = requests.Session()
@ -45,6 +45,7 @@ class TOKENIZER_TYPE(IntEnum):
SPM = auto() SPM = auto()
BPE = auto() BPE = auto()
WPM = auto() WPM = auto()
UGM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible # TODO: this string has to exercise as much pre-tokenizer functionality as possible
@ -55,10 +56,10 @@ if len(sys.argv) == 2:
token = sys.argv[1] token = sys.argv[1]
if not token.startswith("hf_"): if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid") logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>") logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1) sys.exit(1)
else: else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>") logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1) sys.exit(1)
# TODO: add models here, base models preferred # TODO: add models here, base models preferred
@ -86,6 +87,10 @@ models = [
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
] ]
@ -107,9 +112,13 @@ def download_model(model):
os.makedirs(f"models/tokenizers/{name}", exist_ok=True) os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"] files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM: if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model") files.append("tokenizer.model")
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
for file in files: for file in files:
save_path = f"models/tokenizers/{name}/{file}" save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path): if os.path.isfile(save_path):
@ -125,14 +134,14 @@ for model in models:
logger.error(f"Failed to download model {model['name']}. Error: {e}") logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = "" src_ifs = ""
for model in models: for model in models:
name = model["name"] name = model["name"]
tokt = model["tokt"] tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM: if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue continue
# Skip if the tokenizer folder does not exist or there are other download issues previously # Skip if the tokenizer folder does not exist or there are other download issues previously
@ -142,7 +151,10 @@ for model in models:
# create the tokenizer # create the tokenizer
try: try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e: except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded continue # Skip to the next model if the tokenizer can't be loaded
@ -189,7 +201,7 @@ src_func = f"""
res = None res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
# or pull the latest version of the model from Huggingface # or pull the latest version of the model from Huggingface
# don't edit the hashes manually! # don't edit the hashes manually!
{src_ifs} {src_ifs}
@ -198,9 +210,9 @@ src_func = f"""
logger.warning("**************************************************************************************") logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:") logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**") logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}") logger.warning(f"** chkhsh: {{chkhsh}}")
@ -214,7 +226,7 @@ src_func = f"""
return res return res
""" """
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub( convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
@ -225,7 +237,7 @@ convert_py = re.sub(
convert_py_pth.write_text(convert_py, encoding="utf-8") convert_py_pth.write_text(convert_py, encoding="utf-8")
logger.info("+++ convert-hf-to-gguf.py was updated") logger.info("+++ convert_hf_to_gguf.py was updated")
# generate tests for each tokenizer model # generate tests for each tokenizer model
@ -263,6 +275,7 @@ tests = [
"\n =", "\n =",
"' era", "' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天", "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"!!!!!!",
"3", "3",
"33", "33",
"333", "333",
@ -272,7 +285,8 @@ tests = [
"3333333", "3333333",
"33333333", "33333333",
"333333333", "333333333",
# "Cửa Việt", # llama-bpe fails on this "Cửa Việt", # llama-bpe fails on this
" discards",
chktxt, chktxt,
] ]
@ -300,7 +314,10 @@ for model in models:
# create the tokenizer # create the tokenizer
try: try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e: except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}") logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop continue # Skip this model and continue with the next one in the loop
@ -326,6 +343,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
for model in models: for model in models:
name = model["name"] name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n") logger.info("\n")


@ -354,7 +354,8 @@ class GGMLToGGUF:
def handle_metadata(cfg, hp): def handle_metadata(cfg, hp):
import convert import examples.convert_legacy_llama as convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory' assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json" hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json" orig_config_path = cfg.model_metadata_dir / "params.json"

docs/android.md (new file, 56 lines)

@ -0,0 +1,56 @@
# Android
## Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is an Android terminal app that lets you run `llama.cpp` on an Android device (no root required). First, install the required packages:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
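
As a rough sketch (assuming the standard CMake build from the Linux instructions works unchanged inside Termux), the build looks like this:

```
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release
```

The resulting binaries (e.g. `llama-cli`) typically end up under `build/bin`.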
## Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can do this in Termux:
```
$ mkdir build-android
$ cd build-android
$ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [Termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+ you need to run the command twice).
Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executables to `/data/data/com.termux/files/home/bin` and then run the following commands in Termux to make them executable
(assuming you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`):
```
$ cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$ cd /data/data/com.termux/files/home/bin
$ chmod +x ./*
```
Download the model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`:
```
$ mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$ cd /data/data/com.termux/files/home/bin
$ ./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on a Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

docs/build.md (new file, 290 lines)

@ -0,0 +1,290 @@
# Build llama.cpp locally
**To get the Code:**
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
In order to build llama.cpp you have four different options.
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- On Windows:
1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your PC.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to the **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
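
For example, a sketch using the flags mentioned above (the model path and binary location are placeholders):

```bash
# build without Metal support
cmake -B build -DGGML_METAL=OFF
cmake --build build --config Release

# or keep the Metal build, but run inference entirely on the CPU
./llama-cli -m model.gguf -ngl 0
```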
## BLAS Build
Building the program with BLAS support may improve prompt-processing performance when using batch sizes larger than 32 (the default is 512). CPU-only BLAS implementations do not affect normal generation performance, while GPU-backed BLAS implementations (e.g. cuBLAS, hipBLAS) may also improve generation performance. There are currently several BLAS implementations available for build and use:
### Accelerate Framework:
This is only available on macOS and it's enabled by default. You can just build using the normal instructions.
### OpenBLAS:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded, copy `libopenblas.a` (located inside the `lib` folder) into `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip, copy the contents of the `include` folder into `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
### BLIS
Check [BLIS.md](./backend/BLIS.md) for more information.
### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
The SYCL-based build of llama.cpp is used to **support Intel GPUs** (Data Center Max series, Flex series, Arc series, built-in GPUs and iGPUs).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL
Building with the oneAPI compilers makes the avx_vnni instruction set available on Intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
- Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. Otherwise, please install oneAPI and follow the steps below:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake --build build --config Release
```
- Using oneAPI docker image:
If you do not want to source the environment variables and install oneAPI manually, you can also build the code using the Intel Docker container [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
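As a rough sketch (the image tag and the mount path are illustrative), configuring and building inside that container could look like:
```bash
docker run -it --rm -v "$(pwd):/app" -w /app intel/oneapi-basekit:latest \
    bash -c 'cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp \
                 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON \
             && cmake --build build --config Release'
```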
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### CUDA
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can install it via your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are needed before compiling.
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
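As an illustration (the values and the model path are placeholders, not tuning advice), a couple of these options can be set at configure time, and `CUDA_VISIBLE_DEVICES` can pin a run to a single GPU:
```bash
# enable one of the optional tweaks alongside the CUDA backend
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON
cmake --build build --config Release

# run on GPU 0 only, offloading all layers
CUDA_VISIBLE_DEVICES=0 ./build/bin/llama-cli -m /path/to/model.gguf -p "Hello" -n 64 -ngl 99
```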
### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to the Radeon RX 7900 XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` against the list of processors, e.g. `gfx1035` maps to `gfx1030`.
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar supported GPU version, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
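For example (the model path is a placeholder), running on the first GPU while presenting a gfx1031/gfx1035 card as gfx1030:
```bash
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 \
    ./build/bin/llama-cli -m /path/to/model.gguf -p "Hello" -n 64 -ngl 99
```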
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
### Vulkan
**With docker**:
You don't need to install the Vulkan SDK; it will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
**Without docker**:
First, make sure you have installed the [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html).
For example, on Ubuntu 22.04 (jammy), use the commands below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Alternatively, your package manager might be able to provide the appropriate libraries.
For example, for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install the `vulkan-devel`, `glslc` and `glslang` packages.
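For instance, using the distro packages mentioned above:
```bash
# Ubuntu 22.04
sudo apt install libvulkan-dev
# Fedora 40
sudo dnf install vulkan-devel glslc glslang
```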
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Android
To read the documentation on how to build on Android, [click here](./android.md)

View File

@ -1,4 +1,4 @@
## Add a new model architecture to `llama.cpp` # Add a new model architecture to `llama.cpp`
Adding a model requires few steps: Adding a model requires few steps:
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF ### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

86
docs/docker.md Normal file
View File

@ -0,0 +1,86 @@
# Docker
## Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (e.g. /llama/models)
## Images
We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there are the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
## Usage
The easiest way to download the models, convert them to ggml, and optimize them is with the `--all-in-one` command of the full Docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
## Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
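A quick way to confirm that the container runtime can see the GPU is to run `nvidia-smi` in a throwaway container (the image tag is illustrative):
```bash
docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi
```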
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
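These defaults can be overridden at build time with `--build-arg`; for example (the version value is illustrative):
```bash
docker build -t local/llama.cpp:light-cuda \
    --build-arg CUDA_VERSION=12.3.1 \
    -f .devops/llama-cli-cuda.Dockerfile .
```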
The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
## Usage
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

39
docs/install.md Normal file
View File

@ -0,0 +1,39 @@
# Install pre-built version of llama.cpp
## Homebrew
On Mac and Linux, the Homebrew package manager can be used via
```sh
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
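After installation the binaries should be available on your `PATH`; for example (the model path and flags are illustrative):
```sh
llama-cli --help
llama-server -m /path/to/model.gguf --port 8080
```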
## Nix
On Mac and Linux, the Nix package manager can be used via
```sh
nix profile install nixpkgs#llama-cpp
```
For flake-enabled installs.
Or
```sh
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake-enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```sh
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

View File

@ -23,6 +23,7 @@ else()
add_subdirectory(export-lora) add_subdirectory(export-lora)
add_subdirectory(finetune) add_subdirectory(finetune)
add_subdirectory(gbnf-validator) add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf) add_subdirectory(gguf)
add_subdirectory(gritlm) add_subdirectory(gritlm)

View File

@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8) var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false) let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
if nTokens < 0 { if nTokens < 0 {
let actualTokensCount = -Int(nTokens) let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount) result = .init(repeating: 0, count: actualTokensCount)
@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
token, token,
&result, &result,
Int32(result.count), Int32(result.count),
0,
false false
) )
assert(check == actualTokensCount) assert(check == actualTokensCount)

View File

@ -93,14 +93,34 @@ int main(int argc, char ** argv) {
// create a llama_batch // create a llama_batch
// we use this object to submit token data for decoding // we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
for (int32_t i = 0; i < n_parallel; ++i) {
seq_ids[i] = i;
}
// evaluate the initial prompt // evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) { for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false); llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
} }
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = llama_token_bos(model);
}
llama_batch_clear(batch);
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
}
// llama_decode will output logits only for the last token of the prompt // llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
// assign the system KV cache to all parallel sequences //// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) { //for (int32_t i = 1; i < n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); // llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
} //}
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);

View File

@ -353,7 +353,7 @@ class Metadata:
version: Optional[str] = None version: Optional[str] = None
url: Optional[str] = None url: Optional[str] = None
description: Optional[str] = None description: Optional[str] = None
licence: Optional[str] = None license: Optional[str] = None
source_url: Optional[str] = None source_url: Optional[str] = None
source_hf_repo: Optional[str] = None source_hf_repo: Optional[str] = None
@ -492,12 +492,13 @@ class LazyTensor:
LazyModel: TypeAlias = 'dict[str, LazyTensor]' LazyModel: TypeAlias = 'dict[str, LazyTensor]'
ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
@dataclass @dataclass
class ModelPlus: class ModelPlus:
model: LazyModel model: LazyModel
paths: list[Path] # Where this was read from. paths: list[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors', 'none'] format: ModelFormat
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab. vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
@ -536,7 +537,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
formats = set(mp.format for mp in models_plus) formats: set[ModelFormat] = set(mp.format for mp in models_plus)
assert len(formats) == 1, "different formats?" assert len(formats) == 1, "different formats?"
format = formats.pop() format = formats.pop()
paths = [path for mp in models_plus for path in mp.paths] paths = [path for mp in models_plus for path in mp.paths]
@ -555,7 +556,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else: else:
model = merge_sharded([mp.model for mp in models_plus]) model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types return ModelPlus(model, paths, format, vocab)
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -805,7 +806,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: Metadata) -> None: def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
# Metadata About The Model And Its Provenence # Metadata About The Model And Its Provenence
name = "LLaMA" name = "LLaMA"
if metadata is not None and metadata.name is not None: if metadata is not None and metadata.name is not None:
@ -827,8 +828,8 @@ class OutputFile:
self.gguf.add_url(metadata.url) self.gguf.add_url(metadata.url)
if metadata.description is not None: if metadata.description is not None:
self.gguf.add_description(metadata.description) self.gguf.add_description(metadata.description)
if metadata.licence is not None: if metadata.license is not None:
self.gguf.add_licence(metadata.licence) self.gguf.add_licence(metadata.license)
if metadata.source_url is not None: if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url) self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None: if metadata.source_hf_repo is not None:
@ -943,7 +944,7 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -977,7 +978,7 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
metadata: Metadata = None, metadata: Metadata | None = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1396,6 +1397,8 @@ def main(args_in: list[str] | None = None) -> None:
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab vocab = model_plus.vocab
assert params is not None
logger.info(f"Vocab info: {vocab}") logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}") logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model

View File

@ -0,0 +1,51 @@
# Migration notice for binary filenames
> [!IMPORTANT]
> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
Please update all scripts and workflows to use the new binary names; a rough rename sketch follows the table below.
| Old Filename | New Filename |
| ---- | ---- |
| main | llama-cli |
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
| libllava.a | libllava.a |
| baby-llama | llama-baby-llama |
| batched | llama-batched |
| batched-bench | llama-batched-bench |
| benchmark-matmult | llama-benchmark-matmult |
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
| lookahead | llama-lookahead |
| lookup | llama-lookup |
| lookup-create | llama-lookup-create |
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |
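As a rough sketch of what updating an existing shell script might look like (the filename and patterns are hypothetical; word-boundary matches can still hit unrelated text, so review the result before committing):
```sh
# GNU sed shown; on macOS use `sed -i ''`
sed -i -e 's/\bmain\b/llama-cli/g' -e 's/\bserver\b/llama-server/g' run-llama.sh
```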

View File

@ -0,0 +1,35 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <cstdio>
#include <cstdlib>  // for EXIT_FAILURE
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
// Append "llama-" to the beginning of filename to get the replacement filename
auto replacement_filename = "llama-" + filename;
// The exception is if the filename is "main", then our replacement filename is "llama-cli"
if (filename == "main") {
replacement_filename = "llama-cli";
}
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
}

View File

@ -58,4 +58,3 @@ The above command will output space-separated float values.
```powershell ```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
``` ```

View File

@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
char src1_str[128] = {0}; char src1_str[128] = {0};
if (src1) { if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
} }
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,

View File

@ -87,4 +87,4 @@ The LORA rank can be configured for each model tensor type separately with these
The LORA rank of 'norm' tensors should always be 1. The LORA rank of 'norm' tensors should always be 1.
To see all available options use `finetune --help`. To see all available options use `llama-finetune --help`.

View File

@ -74,7 +74,7 @@ class Tensor:
if len(self.ne) == 0: if len(self.ne) == 0:
self.nbytes = 0 self.nbytes = 0
else: else:
self.nbytes = int(np.product(self.ne)) * 4 self.nbytes = int(np.prod(self.ne)) * 4
else: else:
raise ValueError(f"Unhandled data type '{self.dtype}'") raise ValueError(f"Unhandled data type '{self.dtype}'")

View File

@ -8,7 +8,7 @@ if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. # MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
while getopts "dg" opt; do while getopts "dg" opt; do
case $opt in case $opt in

View File

@ -0,0 +1,15 @@
set(TARGET llama-gguf-hash)
add_executable(${TARGET} gguf-hash.cpp)
install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies
include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -0,0 +1,206 @@
# llama-gguf-hash
CLI to hash GGUF files to detect differences at a per-model and per-tensor level.
**Command line options:**
- `--help`: display help message
- `--xxh64`: use xxhash 64-bit hash mode (default)
- `--sha1`: use sha1
- `--sha256`: use sha256
- `--all`: use all hash types
- `--no-layer`: exclude the per-layer hashes
- `--uuid`: generate a UUIDv5 ID
- `-c`, `--check <manifest>`: verify against a manifest
## About
While most POSIX systems already have hash-checking programs such as sha256sum, those
are designed to check entire files. This is not ideal for our purpose if we want
to check the consistency of the tensor data even when the metadata content of the
gguf KV store has been updated.
This program is designed to hash a gguf tensor payload on a 'per tensor layer'
basis in addition to an 'entire tensor model' hash. The intent is that the whole-model
hash can be checked first; if any inconsistency is detected, the per-tensor hashes
can then be used to narrow down the specific tensor layer that is inconsistent.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per-tensor-layer hashes to assist in narrowing down
  the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for database keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program, if no arguments are provided, is to hash
  using xxhash's xxh64 mode, because it is very fast and is primarily targeted
  towards maintainers who may want to use this in automated tests.
- xxhash supports xxh32 and xxh128 for 32-bit and 128-bit hashes respectively;
  however, we picked the 64-bit xxhash since most computers are 64-bit as of 2024 and
  are thus better suited to computing a hash that is 64 bits in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
## Generation and Verification Example
To generate a manifest we may use this command
```bash
./llama-gguf-hash --all test.gguf > test.gguf.manifest
```
This would generate a manifest like the one below, which contains multiple hash types as well as per-tensor-layer hashes
(UUID is excluded, as it is an ID, not a hash)
```bash
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
xxh64 a0af5d700049693b test.gguf:tensor_2
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
xxh64 1257733306b7992d test.gguf:tensor_4
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
xxh64 d238d16ba4711e58 test.gguf:tensor_5
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
xxh64 c22021c29854f093 test.gguf:tensor_7
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
xxh64 936df61f5d64261f test.gguf:tensor_8
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
xxh64 93fd20c64421c081 test.gguf:tensor_9
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
xxh64 5a54d3aad816f302 test.gguf
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
```
We can then use the normal check command, which by default picks the highest-security-strength hash in the manifest and verifies against that:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may explicitly ask for a faster hash like:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may want to just check that all of the hashes are valid:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --all test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
## Crypto/Hash Libraries Used
These micro C library dependencies were installed via the [clib C package manager](https://github.com/clibs):
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View File

@ -0,0 +1,13 @@
{
"name": "rotate-bits",
"version": "0.1.1",
"repo": "jb55/rotate-bits.h",
"description": "rotate bits",
"keywords": ["rotl", "rotr"],
"src": ["rotate-bits.h"],
"license": "Public Domain",
"development": {
"thlorenz/tap.c": "*"
}
}

View File

@ -0,0 +1,46 @@
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
#define ROTL32(v, n) _rotl((v), (n))
#define ROTL64(v, n) _rotl64((v), (n))
#define ROTR32(v, n) _rotr((v), (n))
#define ROTR64(v, n) _rotr64((v), (n))
#else
#include <stdint.h>
#define U8V(v) ((uint8_t)(v) & 0xFFU)
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
#define ROTL32(v, n) \
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
// tests fail if we don't have this cast...
#define ROTL64(v, n) \
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#endif
#define ROTL8(v, n) \
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#endif

View File

@ -0,0 +1,9 @@
{
"name": "sha1",
"version": "0.0.1",
"repo": "clibs/sha1",
"description": "sha1 hash algorithm",
"keywords": ["sha1", "hash"],
"license": "public domain",
"src": ["sha1.c", "sha1.h"]
}

View File

@ -0,0 +1,295 @@
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
/* for uint32_t */
#include <stdint.h>
#include "sha1.h"
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|(rol(block->l[i],8)&0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
/* Hash a single 512-bit block. This is the core of the algorithm. */
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
)
{
uint32_t a, b, c, d, e;
typedef union
{
unsigned char c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
/* SHA1Init - Initialize new context */
void SHA1Init(
SHA1_CTX * context
)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
)
{
uint32_t i;
uint32_t j;
j = context->count[0];
if ((context->count[0] += len << 3) < j)
context->count[1]++;
context->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63)
{
memcpy(&context->buffer[j], data, (i = 64 - j));
SHA1Transform(context->state, context->buffer);
for (; i + 63 < len; i += 64)
{
SHA1Transform(context->state, &data[i]);
}
j = 0;
}
else
i = 0;
memcpy(&context->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
)
{
unsigned i;
unsigned char finalcount[8];
unsigned char c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
unsigned char *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
uint32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (unsigned char) t}
#else
for (i = 0; i < 8; i++)
{
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
}
#endif
c = 0200;
SHA1Update(context, &c, 1);
while ((context->count[0] & 504) != 448)
{
c = 0000;
SHA1Update(context, &c, 1);
}
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++)
{
digest[i] = (unsigned char)
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(context, '\0', sizeof(*context));
memset(&finalcount, '\0', sizeof(finalcount));
}
void SHA1(
char *hash_out,
const char *str,
uint32_t len)
{
SHA1_CTX ctx;
unsigned int ii;
SHA1Init(&ctx);
for (ii=0; ii<len; ii+=1)
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
SHA1Final((unsigned char *)hash_out, &ctx);
}

View File

@ -0,0 +1,52 @@
#ifndef SHA1_H
#define SHA1_H
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
*/
#include "stdint.h"
#if defined(__cplusplus)
extern "C" {
#endif
typedef struct
{
uint32_t state[5];
uint32_t count[2];
unsigned char buffer[64];
} SHA1_CTX;
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
);
void SHA1Init(
SHA1_CTX * context
);
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
);
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
);
void SHA1(
char *hash_out,
const char *str,
uint32_t len);
#if defined(__cplusplus)
}
#endif
#endif /* SHA1_H */

View File

@ -0,0 +1,15 @@
{
"name": "sha256",
"version": "0.0.2",
"repo": "jb55/sha256.c",
"description": "sha256 in c",
"keywords": ["sha256", "sha2"],
"src": ["sha256.c", "sha256.h"],
"dependencies": {
"jb55/rotate-bits.h": "0.1.1"
},
"development": {
"thlorenz/tap.c": "*"
}
}

View File

@ -0,0 +1,221 @@
/* Crypto/Sha256.c -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "rotate-bits/rotate-bits.h"
#include "sha256.h"
/* define it for speed optimization */
#define _SHA256_UNROLL
#define _SHA256_UNROLL2
void
sha256_init(sha256_t *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, (i+1)); \
R(g,h,a,b,c,d,e,f, (i+2)); \
R(f,g,h,a,b,c,d,e, (i+3)); \
R(e,f,g,h,a,b,c,d, (i+4)); \
R(d,e,f,g,h,a,b,c, (i+5)); \
R(c,d,e,f,g,h,a,b, (i+6)); \
R(b,c,d,e,f,g,h,a, (i+7))
#else
#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void
sha256_transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16] = {0};
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
static void
sha256_write_byte_block(sha256_t *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
sha256_transform(p->state, data32);
}
void
sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
{
sha256_t hash;
sha256_init(&hash);
sha256_update(&hash, data, size);
sha256_final(&hash, buf);
}
void
sha256_update(sha256_t *p, const unsigned char *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
sha256_write_byte_block(p);
}
}
}
void
sha256_final(sha256_t *p, unsigned char *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
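/* Standard SHA-256 padding: append the 0x80 byte, zero-fill until 8 bytes
   remain in the final 64-byte block, then append the message length in bits
   as a 64-bit big-endian value. */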
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
sha256_write_byte_block(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
lenInBits <<= 8;
}
sha256_write_byte_block(p);
for (i = 0; i < 8; i++)
{
*digest++ = (unsigned char)(p->state[i] >> 24);
*digest++ = (unsigned char)(p->state[i] >> 16);
*digest++ = (unsigned char)(p->state[i] >> 8);
*digest++ = (unsigned char)(p->state[i]);
}
sha256_init(p);
}

View File

@ -0,0 +1,24 @@
/* Sha256.h -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain */
#ifndef __CRYPTO_SHA256_H
#define __CRYPTO_SHA256_H
#include <stdlib.h>
#include <stdint.h>
#define SHA256_DIGEST_SIZE 32
typedef struct sha256_t
{
uint32_t state[8];
uint64_t count;
unsigned char buffer[64];
} sha256_t;
void sha256_init(sha256_t *p);
void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
void sha256_final(sha256_t *p, unsigned char *digest);
void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
#endif

View File

@ -0,0 +1,12 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "mofosyne/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",
"src": [
"xxhash.c",
"xxhash.h"
]
}

View File

@ -0,0 +1,42 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

File diff suppressed because it is too large

View File

@ -0,0 +1,693 @@
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
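// The two macros above encode the same namespace UUID: once as a printable string and
// once as the raw 16 bytes that are hashed ahead of the tensor data when deriving the
// UUIDv5 of a model file.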
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hashes have been generated or validated
HASH_EXIT_FAILURE = 1, // Generic failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash validation was attempted but the manifest has no matching entry
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but it contains no known hash format
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not in a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hashes\n");
printf(" --no-layer exclude per-layer hashes\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && argv[arg_idx][0] == '-'; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
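// Further hypothetical examples of accepted entries:
//   'sha256 <64 hex chars> test.gguf:tensor_0'  (per-tensor entry)
//   'xxh64  f66e9cd66a4396a0 test.gguf'         (whole-model entry; the file name takes the place of the tensor name)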
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
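// e.g. digest byte 0x3a becomes 0x5a in uuid[6] (version nibble = 5),
//      and 0xf3 becomes 0xb3 in uuid[8] (variant bits = 0b10)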
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
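// render the 64-bit digest as 16 lowercase hex characters, most significant byte first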
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_UUID_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but check the per-layer results just in case
// If per-layer entries are missing we don't mind, since the overall model hash checked out
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// the overall model hash did not match the manifest
return HASH_EXIT_FAILURE;
}
// All checks appear to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default, if no switch argument is provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}

View File

@ -204,21 +204,17 @@ int main(int argc, char ** argv) {
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) { GGML_ASSERT(llama_token_prefix(model) >= 0);
inp_sfx.erase(inp_sfx.begin()); GGML_ASSERT(llama_token_suffix(model) >= 0);
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
@ -516,19 +512,14 @@ int main(int argc, char ** argv) {
string_process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix // tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
@ -659,4 +650,3 @@ int main(int argc, char ** argv) {
return 0; return 0;
} }

View File

@ -1,9 +1,9 @@
# Usage: # Usage:
#! ./llama-server -m some-model.gguf & #! ./llama-server -m some-model.gguf &
#! pip install pydantic #! pip install pydantic
#! python json-schema-pydantic-example.py #! python json_schema_pydantic_example.py
from pydantic import BaseModel, Extra, TypeAdapter from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen from annotated_types import MinLen
from typing import Annotated, List, Optional from typing import Annotated, List, Optional
import json, requests import json, requests
@ -17,6 +17,9 @@ if True:
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
''' '''
response_format = None
type_adapter = None
if response_model: if response_model:
type_adapter = TypeAdapter(response_model) type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema() schema = type_adapter.json_schema()

View File

@ -1,4 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations
import argparse import argparse
import itertools import itertools
import json import json
@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
raise RuntimeError("At least one of min_value or max_value must be set") raise RuntimeError("At least one of min_value or max_value must be set")
class BuiltinRule: class BuiltinRule:
def __init__(self, content: str, deps: list = None): def __init__(self, content: str, deps: list | None = None):
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
@ -248,7 +250,7 @@ class SchemaConverter:
def _format_literal(self, literal): def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
) )
return f'"{escaped}"' return f'"{escaped}"'
@ -403,11 +405,11 @@ class SchemaConverter:
i = 0 i = 0
length = len(pattern) length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str: def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s (txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]: def transform() -> tuple[str, bool]:
''' '''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
''' '''
@ -420,7 +422,7 @@ class SchemaConverter:
# We only need a flat structure here to apply repetition operators to the last item, and # We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!) # (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = [] seq: list[tuple[str, bool]] = []
def get_dot(): def get_dot():
if self._dotall: if self._dotall:

View File

@ -322,7 +322,7 @@ actor LlamaContext {
defer { defer {
result.deallocate() result.deallocate()
} }
let nTokens = llama_token_to_piece(model, token, result, 8, false) let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
if nTokens < 0 { if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens)) let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -330,7 +330,7 @@ actor LlamaContext {
defer { defer {
newResult.deallocate() newResult.deallocate()
} }
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false) let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer) return Array(bufferPointer)
} else { } else {

View File

@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
``` ```
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: 2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh ```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
``` ```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh ```sh
python ./examples/llava/convert-image-encoder-to-gguf \ python ./examples/llava/convert_image_encoder_to_gguf \
-m path/to/clip-vit-large-patch14-336 \ -m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \ --output-dir path/to/MobileVLM-1.7B \
@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \
``` ```
```sh ```sh
python ./examples/llava/convert-image-encoder-to-gguf \ python ./examples/llava/convert_image_encoder_to_gguf \
-m path/to/clip-vit-large-patch14-336 \ -m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \ --output-dir path/to/MobileVLM-1.7B_V2 \
--projector-type ldpv2 --projector-type ldpv2
``` ```
4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
``` ```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`

View File

@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
pip install -r examples/llava/requirements.txt pip install -r examples/llava/requirements.txt
``` ```
3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: 3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh ```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
``` ```
4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: 4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh ```sh
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
``` ```
5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: 5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
``` ```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
pip install -r examples/llava/requirements.txt pip install -r examples/llava/requirements.txt
``` ```
3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: 3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console ```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
``` ```
- you will find a llava.projector and a llava.clip file in your model directory - you will find a llava.projector and a llava.clip file in your model directory
@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
5) Create the visual gguf model: 5) Create the visual gguf model:
```console ```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
``` ```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format: 6) Then convert the model to gguf format:
```console ```console
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
``` ```
7) And finally we can run the llava cli using the 1.6 model version: 7) And finally we can run the llava cli using the 1.6 model version:

View File

@ -185,6 +185,8 @@ else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
if has_text_encoder: if has_text_encoder:
assert t_hparams is not None
assert tokens is not None
# text_model hparams # text_model hparams
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@ -259,8 +261,8 @@ if has_vision_encoder:
if processor is not None: if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
else: else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std image_std = args.image_std if args.image_std is not None else default_image_std
@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
if has_llava_projector: if has_llava_projector:
model.vision_model.encoder.layers.pop(-1) model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
@ -286,7 +288,7 @@ if has_llava_projector:
print("Projector tensors added\n") print("Projector tensors added\n")
state_dict = model.state_dict() state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items(): for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this # we don't need this

View File

@ -2,7 +2,9 @@ import argparse
import glob import glob
import os import os
import torch import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast
# Function to determine if file is a SafeTensor file # Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path): def is_safetensor_file(file_path):
@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
def load_model(file_path): def load_model(file_path):
if is_safetensor_file(file_path): if is_safetensor_file(file_path):
tensors = {} tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f: with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
for key in f.keys(): for key in f.keys():
tensors[key] = f.get_tensor(key).clone() tensors[key] = f.get_tensor(key).clone()
# output shape # output shape
@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
if last_checkpoint is not None: if last_checkpoint is not None:
for k, v in last_checkpoint.items(): for k, v in last_checkpoint.items():
print(k) print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
print("No tensors found. Is this a LLaVA model?") print("No tensors found. Is this a LLaVA model?")
exit() exit()
@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors} # projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {} projector = {}
for name in mm_tensors: for name in mm_tensors:
assert last_checkpoint is not None
projector[name] = last_checkpoint[name].float() projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors: for name in first_mm_tensors:
assert first_checkpoint is not None
projector[name] = first_checkpoint[name].float() projector[name] = first_checkpoint[name].float()
if len(projector) > 0: if len(projector) > 0:

View File

@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert-legacy-llama.txt -r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0 pillow~=10.2.0
torch~=2.1.1 torch~=2.2.1

View File

@ -10,4 +10,3 @@ More info:
https://github.com/ggerganov/llama.cpp/pull/4484 https://github.com/ggerganov/llama.cpp/pull/4484
https://github.com/ggerganov/llama.cpp/issues/4226 https://github.com/ggerganov/llama.cpp/issues/4226

View File

@ -48,4 +48,3 @@
build*/ build*/
out/ out/
tmp/ tmp/

View File

@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -1,6 +1,6 @@
# llama.cpp/examples/main # llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
## Table of Contents ## Table of Contents
@ -17,60 +17,59 @@ This example program allows you to use various LLaMA language models in an easy
To get started right away, run the following command, making sure to use the correct path for the model you have: To get started right away, run the following command, making sure to use the correct path for the model you have:
#### Unix-based systems (Linux, macOS, etc.): First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)
Once downloaded, place your model in the models folder in llama.cpp.
### Unix-based systems (Linux, macOS, etc.):
##### Input prompt (One-and-done)
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time" ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
``` ```
##### Conversation mode (Allow for continuous interaction with the model)
#### Windows:
```powershell
llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
AI: What would you like to talk about?
User:'
``` ```
#### Windows: ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1 ./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
#### Windows: ### Windows:
##### Input prompt (One-and-done)
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```
##### Conversation mode (Allow for continuous interaction with the model)
```powershell ```powershell
llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```
#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
## Common Options ## Common Options
In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models: In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
## Input Prompts ## Input Prompts
@ -90,6 +89,7 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@ -117,6 +117,13 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
```sh ```sh
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:" ./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
``` ```
When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled
### Chat templates
`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
Example usage: `--chat-template gemma`
## Context Management ## Context Management
@ -124,9 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size ### Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. - `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size ### Extended Context Size
@ -148,15 +153,15 @@ The following options allow you to control the text generation process and fine-
### Number of Tokens to Predict ### Number of Tokens to Predict
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled) - `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled)
The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in significant pause in output. A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature ### Temperature
@ -164,15 +169,15 @@ It is important to note that the generated text may be shorter than the specifie
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
Example usage: `--temp 0.5` Example usage: `--temp 0`
### Repeat Penalty ### Repeat Penalty
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1). - `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). - `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. - `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.0 (disabled).
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
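One common way to apply such a penalty is the CTRL-style rule of dividing positive logits and multiplying negative ones; the following is a sketch of that rule over a toy logit table, not llama.cpp's sampler code (the newline option and `-1 = ctx-size` are not handled here):
```python
def penalize_repeats(logits: dict[int, float], last_tokens: list[int],
                     repeat_last_n: int, penalty: float) -> dict[int, float]:
    """Push down tokens that already appeared in the recent history."""
    if penalty == 1.0 or repeat_last_n == 0:
        return logits                        # 1.0 or a window of 0 disables the penalty
    window = last_tokens[-repeat_last_n:]    # only the last n tokens are considered
    out = dict(logits)
    for tok in set(window):
        if tok in out:
            out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out

logits = {7: 3.2, 13: 1.1, 42: -0.4}
print(penalize_repeats(logits, last_tokens=[7, 42, 99], repeat_last_n=64, penalty=1.5))
# token 7 is pushed down to ~2.13, token 42 down to -0.6, token 13 is untouched
```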
@ -196,19 +201,19 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95` Example usage: `--top-p 0.95`
### Min P Sampling ### Min-P Sampling
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05). - `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1).
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
Example usage: `--min-p 0.05` Example usage: `--min-p 0.05`
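The worked example above (*p* = 0.05, top probability 0.9, cut-off 0.045) translates directly into code; a small sketch over a toy probability list:
```python
def min_p_filter(probs: list[float], p: float) -> list[float]:
    """Keep tokens whose probability is at least p * max(probs), then renormalize."""
    threshold = p * max(probs)                  # 0.05 * 0.9 = 0.045 in the example
    kept = [x if x >= threshold else 0.0 for x in probs]
    total = sum(kept)
    return [x / total for x in kept]

print(min_p_filter([0.9, 0.05, 0.03, 0.02], p=0.05))
# the 0.03 and 0.02 tokens fall below 0.045 and are removed
```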
### Tail Free Sampling (TFS) ### Tail-Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS. Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
Example usage: `--tfs 0.95` Example usage: `--tfs 0.95`
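A compact sketch of the idea, assuming an already sorted toy distribution (the real implementation works on the model's full probability vector and its index handling differs in detail):
```python
def tail_free_keep(sorted_probs: list[float], z: float) -> int:
    """Return how many of the (descending) sorted tokens to keep."""
    if z >= 1.0:
        return len(sorted_probs)                    # 1.0 disables TFS
    first = [b - a for a, b in zip(sorted_probs, sorted_probs[1:])]
    second = [abs(b - a) for a, b in zip(first, first[1:])]
    total = sum(second) or 1.0
    weights = [s / total for s in second]           # normalized |second derivative|
    cum, keep = 0.0, 1
    for w in weights:
        cum += w
        keep += 1
        if cum > z:
            break
    return keep

probs = [0.55, 0.25, 0.12, 0.05, 0.02, 0.01]
print(tail_free_keep(probs, z=0.95))   # keeps 5 of the 6 tokens, dropping only the flat tail
```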
@ -307,10 +312,8 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text. - `--verbose-prompt`: Print the prompt before generating text.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache. - `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
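The `--tensor-split` proportions described above can be turned into per-GPU fractions like this (a tiny sketch of the arithmetic, not the actual splitting code):
```python
def tensor_split_fractions(split: str) -> list[float]:
    """Turn a '--tensor-split' string such as '3,2' into per-GPU fractions."""
    parts = [float(x) for x in split.split(",")]
    total = sum(parts)
    return [p / total for p in parts]

print(tensor_split_fractions("3,2"))   # [0.6, 0.4] -> 60% on GPU 0, 40% on GPU 1
```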

View File

@ -37,7 +37,8 @@ static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens; static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss; static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens; static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
static bool need_insert_eot = false;
static bool file_exists(const std::string & path) { static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str()); std::ifstream f(path.c_str());
@ -99,7 +100,8 @@ static void write_logfile(
static void sigint_handler(int signo) { static void sigint_handler(int signo) {
if (signo == SIGINT) { if (signo == SIGINT) {
if (!is_interacting && g_params->interactive) { if (!is_interacting && g_params->interactive) {
is_interacting = true; is_interacting = true;
need_insert_eot = true;
} else { } else {
console::cleanup(); console::cleanup();
printf("\n"); printf("\n");
@ -224,7 +226,14 @@ int main(int argc, char ** argv) {
__func__, n_ctx_train, n_ctx); __func__, n_ctx_train, n_ctx);
} }
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); // print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information // print system information
{ {
@ -255,13 +264,15 @@ int main(int argc, char ** argv) {
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1); if (!llama_model_has_encoder(model)) {
GGML_ASSERT(llama_add_eos_token(model) != 1);
}
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
{ {
auto prompt = params.conversation auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt; : params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@ -278,8 +289,13 @@ int main(int argc, char ** argv) {
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model)); if (add_bos) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
} else {
LOG_TEE("error: input is empty\n");
return -1;
}
} }
// Tokenize negative prompt // Tokenize negative prompt
@ -517,6 +533,24 @@ int main(int argc, char ** argv) {
exit(1); exit(1);
} }
if (llama_model_has_encoder(model)) {
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = llama_token_bos(model);
}
embd_inp.clear();
embd_inp.push_back(decoder_start_token_id);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
if (!embd.empty()) { if (!embd.empty()) {
@ -810,7 +844,9 @@ int main(int argc, char ** argv) {
is_antiprompt = true; is_antiprompt = true;
} }
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); if (params.enable_chat_template) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true; is_interacting = true;
printf("\n"); printf("\n");
} }
@ -872,16 +908,24 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer); string_process_escapes(buffer);
} }
std::string user_inp = params.conversation bool format_chat = params.conversation && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer); : std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
llama_token eot = llama_token_eot(model);
embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
need_insert_eot = false;
}
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

View File

@ -1,5 +1,8 @@
# llama.cpp/example/passkey # llama.cpp/example/passkey
A passkey retrieval task is an evaluation method used to measure a language
model's ability to recall information from long contexts.
See the following PRs for more info: See the following PRs for more info:
- https://github.com/ggerganov/llama.cpp/pull/3856 - https://github.com/ggerganov/llama.cpp/pull/3856
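To make the task concrete, a hypothetical prompt can be built like this (a sketch only; the actual example chooses its own filler text, prompt wording, and parameters):
```python
import random

def build_passkey_prompt(n_junk: int = 250) -> tuple[str, int]:
    """Hide a random passkey inside long, repetitive filler text."""
    passkey = random.randint(1, 50000)
    filler = "The grass is green. The sky is blue. The sun is yellow. "
    lines = [filler] * n_junk
    lines.insert(random.randint(0, n_junk - 1), f"The pass key is {passkey}. Remember it. ")
    prompt = "".join(lines) + "What is the pass key?"
    return prompt, passkey

prompt, key = build_passkey_prompt()
print(len(prompt), "characters of context, expected answer:", key)
```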

View File

@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, n_kv); params.n_batch = std::min(params.n_batch, n_kv);
} else { } else {
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.kl_divergence) {
params.n_parallel = 1;
} else {
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
}
} }
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) {
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) { if (model == NULL) {

View File

@ -6,10 +6,10 @@ import re
from copy import copy from copy import copy
from enum import Enum from enum import Enum
from inspect import getdoc, isclass from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, Field, create_model from pydantic import BaseModel, create_model
if TYPE_CHECKING: if TYPE_CHECKING:
from types import GenericAlias from types import GenericAlias
@ -17,6 +17,9 @@ else:
# python 3.8 compat # python 3.8 compat
from typing import _GenericAlias as GenericAlias from typing import _GenericAlias as GenericAlias
# TODO: fix this
# pyright: reportAttributeAccessIssue=information
class PydanticDataType(Enum): class PydanticDataType(Enum):
""" """
@ -234,8 +237,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
# Define the integer part rule # Define the integer part rule
integer_part_rule = ( integer_part_rule = (
"integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( "integer-part"
f"-min{min_digit}" if min_digit is not None else "") + (f"-max{max_digit}" if max_digit is not None else "")
+ (f"-min{min_digit}" if min_digit is not None else "")
) )
# Define the fractional part rule based on precision constraints # Define the fractional part rule based on precision constraints
@ -458,7 +462,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel): if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__ # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__: if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
else: else:
init_signature = inspect.signature(model.__init__) init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
@ -680,7 +684,7 @@ def generate_markdown_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -700,7 +704,7 @@ def generate_markdown_documentation(
# Indenting the fields section # Indenting the fields section
documentation += f" {fields_prefix}:\n" documentation += f" {fields_prefix}:\n"
else: else:
documentation += f" Fields:\n" documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel): if isclass(model) and issubclass(model, BaseModel):
for name, field_type in model.__annotations__.items(): for name, field_type in model.__annotations__.items():
# if name == "markdown_code_block": # if name == "markdown_code_block":
@ -778,7 +782,7 @@ def generate_field_markdown(
return field_text return field_text
if field_description != "": if field_description != "":
field_text += f" Description: " + field_description + "\n" field_text += f" Description: {field_description}\n"
# Check for and include field-specific examples if available # Check for and include field-specific examples if available
if hasattr(model, "Config") and hasattr(model.Config, if hasattr(model, "Config") and hasattr(model.Config,
@ -833,7 +837,7 @@ def generate_text_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -1164,7 +1168,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
for name, param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description dynamic_model.model_fields[name].description = param_doc.description
@ -1228,9 +1232,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
return output return output
from enum import Enum
def json_schema_to_python_types(schema): def json_schema_to_python_types(schema):
type_map = { type_map = {
"any": Any, "any": Any,
@ -1275,7 +1276,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] fields[field_name] = (List[array_type], ...)
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":
@ -1285,7 +1286,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = field_data.get("enum", []) required = field_data.get("enum", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
else: else:
field_type = json_schema_to_python_types(field_type) field_type = json_schema_to_python_types(field_type)
fields[field_name] = (field_type, ...) fields[field_name] = (field_type, ...)
@ -1305,6 +1307,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = dictionary.get("required", []) required = dictionary.get("required", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
custom_model = create_model(model_name, **fields) custom_model = create_model(model_name, **fields)
return custom_model return custom_model

View File

@ -1,6 +1,7 @@
# Function calling example using pydantic models. # Function calling example using pydantic models.
from __future__ import annotations
import datetime import datetime
import importlib
import json import json
from enum import Enum from enum import Enum
from typing import Optional, Union from typing import Optional, Union
@ -215,9 +216,9 @@ for call in json_data:
if call["function"] == "Calculator": if call["function"] == "Calculator":
print(Calculator(**call["params"]).run()) print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime": elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather": elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this: # Should output something like this:
# 2024-01-14 13:36:06 # 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"} # {"location": "London", "temperature": "42", "unit": "celsius"}

View File

@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
} }
if (use_reference) { if (use_reference) {
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size); qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else { } else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size); qfns.from_float(input_scratch, quantized_scratch, chunk_size);
} }

View File

@ -4,7 +4,89 @@ You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-
Note: It is synced from llama.cpp `main` every 6 hours. Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B Example usage:
```bash
# obtain the official LLaMA model weights and place them in ./models
ls ./models
llama-2-7b tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers
ls ./models
<folder containing weights and tokenizer json> vocab.json
# [Optional] for PyTorch .bin models like Mistral-7B
ls ./models
<folder containing weights and tokenizer json>
# install Python dependencies
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert_hf_to_gguf.py models/mymodel/
# quantize the model to 4-bits (using Q4_K_M method)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```
Run the quantized model:
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
## Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
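The quantized sizes in the table follow from the bits-per-weight figures further down; a quick back-of-the-envelope check (a sketch, using nominal parameter counts):
```python
def quantized_size_gb(n_params_billion: float, bits_per_weight: float) -> float:
    """Approximate file size in GB: parameters * bits-per-weight / 8 bits per byte."""
    return n_params_billion * bits_per_weight / 8

# Q4_0 is roughly 4.5 bits per weight (see the BPW tables below)
print(round(quantized_size_gb(7, 4.5), 1))    # ~3.9 GB, matching the 7B Q4_0 row
print(round(quantized_size_gb(13, 4.5), 1))   # ~7.3 GB, close to the 7.8 GB listed
```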
## Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements and new i-quants
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4957 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
**Llama 2 7B**
| Quantization | Bits per Weight (BPW) | | Quantization | Bits per Weight (BPW) |
|--------------|-----------------------| |--------------|-----------------------|
@ -18,7 +100,8 @@ Note: It is synced from llama.cpp `main` every 6 hours.
| Q5_K_M | 5.68 | | Q5_K_M | 5.68 |
| Q6_K | 6.56 | | Q6_K | 6.56 |
## Llama 2 13B **Llama 2 13B**
Quantization | Bits per Weight (BPW) Quantization | Bits per Weight (BPW)
-- | -- -- | --
Q2_K | 3.34 Q2_K | 3.34
@ -31,7 +114,7 @@ Q5_K_S | 5.51
Q5_K_M | 5.67 Q5_K_M | 5.67
Q6_K | 6.56 Q6_K | 6.56
# Llama 2 70B **Llama 2 70B**
Quantization | Bits per Weight (BPW) Quantization | Bits per Weight (BPW)
-- | -- -- | --

View File

@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View File

@ -366,7 +366,8 @@ Notice that each `probs` is an array of length `n_probs`.
"assistant_name": "", "assistant_name": "",
"user_name": "", "user_name": "",
"default_generation_settings": { ... }, "default_generation_settings": { ... },
"total_slots": 1 "total_slots": 1,
"chat_template": ""
} }
``` ```
@ -374,8 +375,9 @@ Notice that each `probs` is an array of length `n_probs`.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option) - `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
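For instance, the new `chat_template` field can be read back with a plain HTTP request (a sketch using the `requests` package against a locally running server; the host and port are assumptions):
```python
import requests

# assumes llama-server is running locally on its default port
props = requests.get("http://localhost:8080/props").json()
print(props["total_slots"])
print(props["chat_template"][:200])   # the model's own Jinja2 template, if any
```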
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:* *Options:*
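A minimal client call against this endpoint might look like the following (a sketch using the `openai` Python package pointed at the local server, mirroring what the server tests do; the port, API key, and model name are placeholders):
```python
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="nope")
resp = client.chat.completions.create(
    model="anything",   # placeholder; the server answers with whatever model it loaded
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    max_tokens=32,
)
print(resp.choices[0].message.content)
```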

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import argparse import argparse
import json import json
import os import os
@ -59,10 +61,11 @@ def main(args_in: list[str] | None = None) -> None:
sys.exit(1) sys.exit(1)
# start the benchmark # start the benchmark
iterations = 0
data = {}
try: try:
start_benchmark(args) start_benchmark(args)
iterations = 0
with open("results.github.env", 'w') as github_env: with open("results.github.env", 'w') as github_env:
# parse output # parse output
with open('k6-results.json', 'r') as bench_results: with open('k6-results.json', 'r') as bench_results:
@ -129,7 +132,7 @@ def main(args_in: list[str] | None = None) -> None:
timestamps, metric_values = zip(*values) timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values] metric_values = [float(value) for value in metric_values]
prometheus_metrics[metric] = metric_values prometheus_metrics[metric] = metric_values
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80) plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric) plt.plot(timestamps_dt, metric_values, label=metric)
plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
@ -156,7 +159,7 @@ def main(args_in: list[str] | None = None) -> None:
plt.close() plt.close()
# Mermaid format in case images upload failed # Mermaid format in case images upload failed
with (open(f"{metric}.mermaid", 'w') as mermaid_f): with open(f"{metric}.mermaid", 'w') as mermaid_f:
mermaid = ( mermaid = (
f"""--- f"""---
config: config:
@ -278,7 +281,7 @@ def start_server_background(args):
} }
server_process = subprocess.Popen( server_process = subprocess.Popen(
args, args,
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):

View File

@ -737,6 +737,8 @@ struct server_context {
slot.ga_n = ga_n; slot.ga_n = ga_n;
slot.ga_w = ga_w; slot.ga_w = ga_w;
slot.sparams = params.sparams;
slot.reset(); slot.reset();
slots.push_back(slot); slots.push_back(slot);
@ -884,7 +886,8 @@ struct server_context {
bool launch_slot_with_task(server_slot & slot, const server_task & task) { bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot_params default_params; slot_params default_params;
llama_sampling_params default_sparams; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
llama_sampling_params default_sparams = params.sparams;
auto & data = task.data; auto & data = task.data;
if (data.count("__oaicompat") != 0) { if (data.count("__oaicompat") != 0) {
@ -2002,6 +2005,11 @@ struct server_context {
int32_t n_batch = llama_n_batch(ctx); int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx);
// track if this is an embedding or non-embedding batch
// if we've added sampled tokens above, we are in non-embedding mode
// -1: none, 0: non-embedding, 1: embedding
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
// next, batch any pending prompts without exceeding n_batch // next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) { if (params.cont_batching || batch.n_tokens == 0) {
for (auto & slot : slots) { for (auto & slot : slots) {
@ -2172,6 +2180,14 @@ struct server_context {
} }
} }
// check that we are in the right batch_type, if not defer the slot
bool slot_type = slot.embedding ? 1 : 0;
if (batch_type == -1) {
batch_type = slot_type;
} else if (batch_type != slot_type) {
continue;
}
// keep only the common part // keep only the common part
int p0 = (int) system_tokens.size() + slot.n_past; int p0 = (int) system_tokens.size() + slot.n_past;
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
@ -2273,6 +2289,9 @@ struct server_context {
{"n_tokens", batch.n_tokens}, {"n_tokens", batch.n_tokens},
}); });
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, batch_type == 1);
// process the created batch of tokens // process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
@ -2605,7 +2624,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any) // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) { if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) { if (!ctx_server.validate_model_chat_template()) {
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml"; params.chat_template = "chatml";
} }
} }
@ -2967,17 +2986,31 @@ int main(int argc, char ** argv) {
}; };
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
std::string template_key = "tokenizer.chat_template", curr_tmpl;
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
if (tlen > 0) {
std::vector<char> curr_tmpl_buf(tlen + 1, 0);
if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
}
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() }, { "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel } { "total_slots", ctx_server.params.n_parallel },
{ "chat_template", curr_tmpl.c_str() }
}; };
res.set_content(data.dump(), "application/json; charset=utf-8"); res.set_content(data.dump(), "application/json; charset=utf-8");
}; };
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body); json data = json::parse(req.body);
@ -3073,6 +3106,11 @@ int main(int argc, char ** argv) {
}; };
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) { const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
@ -3145,6 +3183,11 @@ int main(int argc, char ** argv) {
}; };
const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body); json data = json::parse(req.body);
@ -3231,13 +3274,8 @@ int main(int argc, char ** argv) {
return res.set_content(data.dump(), "application/json; charset=utf-8"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}; };
const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!params.embedding) {
res.status = 501;
res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
return;
}
const json body = json::parse(req.body); const json body = json::parse(req.body);
bool is_openai = false; bool is_openai = false;

View File

@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 | #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
# 987 | # 987 |

View File

@ -1,5 +1,4 @@
import asyncio import asyncio
import collections
import json import json
import os import os
import re import re
@ -8,19 +7,23 @@ import subprocess
import sys import sys
import threading import threading
import time import time
from collections.abc import Sequence
from contextlib import closing from contextlib import closing
from re import RegexFlag from re import RegexFlag
from typing import Any, Literal, cast
import aiohttp import aiohttp
import numpy as np import numpy as np
import openai import openai
from behave import step from openai.types.chat import ChatCompletionChunk
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete from behave.api.async_step import async_run_until_complete
from prometheus_client import parser from prometheus_client import parser
# pyright: reportRedeclaration=false
@step("a server listening on {server_fqdn}:{server_port}") @step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port): def step_server_config(context, server_fqdn: str, server_port: str):
context.server_fqdn = server_fqdn context.server_fqdn = server_fqdn
context.server_port = int(server_port) context.server_port = int(server_port)
context.n_threads = None context.n_threads = None
@ -74,34 +77,34 @@ def step_server_config(context, server_fqdn, server_port):
@step('a model file {hf_file} from HF repo {hf_repo}') @step('a model file {hf_file} from HF repo {hf_repo}')
def step_download_hf_model(context, hf_file, hf_repo): def step_download_hf_model(context, hf_file: str, hf_repo: str):
context.model_hf_repo = hf_repo context.model_hf_repo = hf_repo
context.model_hf_file = hf_file context.model_hf_file = hf_file
context.model_file = os.path.basename(hf_file) context.model_file = os.path.basename(hf_file)
@step('a model file {model_file}') @step('a model file {model_file}')
def step_model_file(context, model_file): def step_model_file(context, model_file: str):
context.model_file = model_file context.model_file = model_file
@step('a model url {model_url}') @step('a model url {model_url}')
def step_model_url(context, model_url): def step_model_url(context, model_url: str):
context.model_url = model_url context.model_url = model_url
@step('a model alias {model_alias}') @step('a model alias {model_alias}')
def step_model_alias(context, model_alias): def step_model_alias(context, model_alias: str):
context.model_alias = model_alias context.model_alias = model_alias
@step('{seed:d} as server seed') @step('{seed:d} as server seed')
def step_seed(context, seed): def step_seed(context, seed: int):
context.server_seed = seed context.server_seed = seed
@step('{ngl:d} GPU offloaded layers') @step('{ngl:d} GPU offloaded layers')
def step_n_gpu_layer(context, ngl): def step_n_gpu_layer(context, ngl: int):
if 'N_GPU_LAYERS' in os.environ: if 'N_GPU_LAYERS' in os.environ:
new_ngl = int(os.environ['N_GPU_LAYERS']) new_ngl = int(os.environ['N_GPU_LAYERS'])
if context.debug: if context.debug:
@ -111,37 +114,37 @@ def step_n_gpu_layer(context, ngl):
@step('{n_threads:d} threads') @step('{n_threads:d} threads')
def step_n_threads(context, n_threads): def step_n_threads(context, n_threads: int):
context.n_thread = n_threads context.n_thread = n_threads
@step('{draft:d} as draft') @step('{draft:d} as draft')
def step_draft(context, draft): def step_draft(context, draft: int):
context.draft = draft context.draft = draft
@step('{n_ctx:d} KV cache size') @step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx): def step_n_ctx(context, n_ctx: int):
context.n_ctx = n_ctx context.n_ctx = n_ctx
@step('{n_slots:d} slots') @step('{n_slots:d} slots')
def step_n_slots(context, n_slots): def step_n_slots(context, n_slots: int):
context.n_slots = n_slots context.n_slots = n_slots
@step('{n_predict:d} server max tokens to predict') @step('{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict): def step_server_n_predict(context, n_predict: int):
context.n_server_predict = n_predict context.n_server_predict = n_predict
@step('{slot_save_path} as slot save path') @step('{slot_save_path} as slot save path')
def step_slot_save_path(context, slot_save_path): def step_slot_save_path(context, slot_save_path: str):
context.slot_save_path = slot_save_path context.slot_save_path = slot_save_path
@step('using slot id {id_slot:d}') @step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot): def step_id_slot(context, id_slot: int):
context.id_slot = id_slot context.id_slot = id_slot
@ -191,7 +194,7 @@ def step_start_server(context):
@step("the server is {expecting_status}") @step("the server is {expecting_status}")
@async_run_until_complete @async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status): async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
match expecting_status: match expecting_status:
case 'healthy': case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
@ -221,7 +224,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
@step('all slots are {expected_slot_status_string}') @step('all slots are {expected_slot_status_string}')
@async_run_until_complete @async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string): async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
match expected_slot_status_string: match expected_slot_status_string:
case 'idle': case 'idle':
expected_slot_status = 0 expected_slot_status = 0
@ -237,7 +240,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
@step('a completion request with {api_error} api error') @step('a completion request with {api_error} api error')
@async_run_until_complete @async_run_until_complete
async def step_request_completion(context, api_error): async def step_request_completion(context, api_error: Literal['raised'] | str):
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1) seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(), completion = await request_completion(context.prompts.pop(),
@ -777,8 +780,8 @@ def step_assert_metric_value(context, metric_name, metric_value):
def step_available_models(context): def step_available_models(context):
# openai client always expects an api_key # openai client always expects an api_key
openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
openai.api_base = f'{context.base_url}/v1' openai.base_url = f'{context.base_url}/v1/'
context.models = openai.Model.list().data context.models = openai.models.list().data
@step('{n_model:d} models are supported') @step('{n_model:d} models are supported')
@ -789,7 +792,7 @@ def step_supported_models(context, n_model):
@step('model {i_model:d} is {param} {preposition} {param_value}') @step('model {i_model:d} is {param} {preposition} {param_value}')
def step_supported_models(context, i_model, param, preposition, param_value): def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
assert i_model < len(context.models) assert i_model < len(context.models)
model = context.models[i_model] model = context.models[i_model]
@ -798,7 +801,7 @@ def step_supported_models(context, i_model, param, preposition, param_value):
case 'identified': case 'identified':
value = model.id value = model.id
case 'trained': case 'trained':
value = str(model.meta.n_ctx_train) value = str(model.meta["n_ctx_train"])
case _: case _:
assert False, "param {param} not supported" assert False, "param {param} not supported"
assert param_value == value, f"model param {param} {value} != {param_value}" assert param_value == value, f"model param {param} {value} != {param_value}"
@ -810,6 +813,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
print(f"starting {context.n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0 assert context.n_prompts > 0
seeds = await completions_seed(context) seeds = await completions_seed(context)
assert seeds is not None
for prompt_no in range(context.n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
@ -861,7 +865,7 @@ async def request_completion(prompt,
id_slot=None, id_slot=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None, user_api_key=None,
temperature=None): temperature=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending completion request: {prompt}") print(f"Sending completion request: {prompt}")
origin = "my.super.domain" origin = "my.super.domain"
@ -899,8 +903,8 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt, async def oai_chat_completions(user_prompt,
seed, seed,
system_prompt, system_prompt,
base_url, base_url: str,
base_path, base_path: str,
async_client, async_client,
debug=False, debug=False,
temperature=None, temperature=None,
@ -909,7 +913,7 @@ async def oai_chat_completions(user_prompt,
enable_streaming=None, enable_streaming=None,
response_format=None, response_format=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending OAI Chat completions request: {user_prompt}") print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key # openai client always expects an api key
@ -989,32 +993,35 @@ async def oai_chat_completions(user_prompt,
else: else:
try: try:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}{base_path}' openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
chat_completion = openai.Completion.create( assert model is not None
chat_completion = openai.chat.completions.create(
messages=payload['messages'], messages=payload['messages'],
model=model, model=model,
max_tokens=n_predict, max_tokens=n_predict,
stream=enable_streaming, stream=enable_streaming,
response_format=payload.get('response_format'), response_format=payload.get('response_format') or openai.NOT_GIVEN,
seed=seed, seed=seed,
temperature=payload['temperature'] temperature=payload['temperature']
) )
except openai.error.AuthenticationError as e: except openai.AuthenticationError as e:
if expect_api_error is not None and expect_api_error: if expect_api_error is not None and expect_api_error:
return 401 return 401
else: else:
assert False, f'error raised: {e}' assert False, f'error raised: {e}'
if enable_streaming: if enable_streaming:
chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
for chunk in chat_completion: for chunk in chat_completion:
assert len(chunk.choices) == 1 assert len(chunk.choices) == 1
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
if 'content' in delta: if delta.content is not None:
completion_response['content'] += delta['content'] completion_response['content'] += delta.content
completion_response['timings']['predicted_n'] += 1 completion_response['timings']['predicted_n'] += 1
completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
else: else:
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
assert chat_completion.usage is not None
completion_response = { completion_response = {
'content': chat_completion.choices[0].message.content, 'content': chat_completion.choices[0].message.content,
'timings': { 'timings': {
@ -1028,7 +1035,7 @@ async def oai_chat_completions(user_prompt,
return completion_response return completion_response
async def request_embedding(content, seed, base_url=None): async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
@ -1041,7 +1048,7 @@ async def request_embedding(content, seed, base_url=None):
async def request_oai_embeddings(input, seed, async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None, base_url=None, user_api_key=None,
model=None, async_client=False): model=None, async_client=False) -> list[list[float]]:
# openai client always expects an api_key # openai client always expects an api_key
user_api_key = user_api_key if user_api_key is not None else 'nope' user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client: if async_client:
@ -1063,7 +1070,7 @@ async def request_oai_embeddings(input, seed,
response_json = await response.json() response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}" assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list' assert response_json['object'] == 'list'
if isinstance(input, collections.abc.Sequence): if isinstance(input, Sequence):
embeddings = [] embeddings = []
for an_oai_embeddings in response_json['data']: for an_oai_embeddings in response_json['data']:
embeddings.append(an_oai_embeddings['embedding']) embeddings.append(an_oai_embeddings['embedding'])
@ -1072,19 +1079,14 @@ async def request_oai_embeddings(input, seed,
return embeddings return embeddings
else: else:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1' openai.base_url = f'{base_url}/v1/'
oai_embeddings = openai.Embedding.create( assert model is not None
oai_embeddings = openai.embeddings.create(
model=model, model=model,
input=input, input=input,
) )
if isinstance(input, collections.abc.Sequence): return [e.embedding for e in oai_embeddings.data]
embeddings = []
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
embeddings = [oai_embeddings.data.embedding]
return embeddings
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
@ -1122,7 +1124,7 @@ def assert_all_predictions_equal(completion_responses):
if i == j: if i == j:
continue continue
content_j = response_j['content'] content_j = response_j['content']
assert content_i == content_j, "contents not equal" assert content_i == content_j, "contents not equal"
def assert_all_predictions_different(completion_responses): def assert_all_predictions_different(completion_responses):
@ -1136,7 +1138,7 @@ def assert_all_predictions_different(completion_responses):
if i == j: if i == j:
continue continue
content_j = response_j['content'] content_j = response_j['content']
assert content_i != content_j, "contents not different" assert content_i != content_j, "contents not different"
def assert_all_token_probabilities_equal(completion_responses): def assert_all_token_probabilities_equal(completion_responses):
@ -1153,7 +1155,7 @@ def assert_all_token_probabilities_equal(completion_responses):
if i == j: if i == j:
continue continue
probs_j = response_j['completion_probabilities'][pos]['probs'] probs_j = response_j['completion_probabilities'][pos]['probs']
assert probs_i == probs_j, "contents not equal" assert probs_i == probs_j, "contents not equal"
async def gather_tasks_results(context): async def gather_tasks_results(context):
@ -1343,7 +1345,7 @@ def start_server_background(context):
} }
context.server_process = subprocess.Popen( context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]], [str(arg) for arg in [context.server_path, *server_args]],
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):
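The hunks above migrate the server test helpers from the pre-1.0 openai module-level API (openai.Completion.create, openai.api_base, openai.error.AuthenticationError, dict-style streaming deltas) to the 1.x client surface. A minimal sketch of the new call pattern, assuming a local llama.cpp server at a placeholder URL, with a placeholder API key and model name:

import openai

# Placeholder endpoint, key, and model name for a local llama.cpp server.
client = openai.OpenAI(base_url="http://localhost:8080/v1/", api_key="sk-no-key-required")

resp = client.chat.completions.create(
    model="test-model",
    messages=[{"role": "user", "content": "Say hello"}],
    max_tokens=16,
    temperature=0.8,
)
print(resp.choices[0].message.content)

# Streaming now yields typed chunk objects, so delta.content replaces delta['content'].
stream = client.chat.completions.create(
    model="test-model",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")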

View File

@ -1,6 +1,6 @@
aiohttp~=3.9.3 aiohttp~=3.9.3
behave~=1.2.6 behave~=1.2.6
huggingface_hub~=0.20.3 huggingface_hub~=0.20.3
numpy~=1.24.4 numpy~=1.26.4
openai~=0.25.0 openai~=1.30.3
prometheus-client~=0.20.0 prometheus-client~=0.20.0

View File

@ -1054,4 +1054,3 @@
</body> </body>
</html> </html>

View File

@ -1058,4 +1058,3 @@
</body> </body>
</html> </html>

View File

@ -122,8 +122,26 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
for (size_t i = 0; i < messages.size(); ++i) { for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i]; const auto & curr_msg = messages[i];
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content = json_value(curr_msg, "content", std::string("")); std::string role = json_value(curr_msg, "role", std::string(""));
std::string content;
if (curr_msg.contains("content")) {
if (curr_msg["content"].is_string()) {
content = curr_msg["content"].get<std::string>();
} else if (curr_msg["content"].is_array()) {
for (const auto & part : curr_msg["content"]) {
if (part.contains("text")) {
content += "\n" + part["text"].get<std::string>();
}
}
} else {
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
chat.push_back({role, content}); chat.push_back({role, content});
} }
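The change above lets format_chat accept either a plain string or an OpenAI-style array of parts for each message's content field, concatenating the text parts and rejecting anything else. An illustrative Python equivalent of that normalization (a sketch, not the server code itself):

def normalize_content(msg: dict) -> str:
    if "content" not in msg:
        raise ValueError("Missing 'content'")
    content = msg["content"]
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # join the text parts, one per line, mirroring the C++ loop above
        return "".join("\n" + part["text"] for part in content if "text" in part)
    raise ValueError("Invalid 'content' type")

print(normalize_content({"role": "user",
                         "content": [{"type": "text", "text": "hi"},
                                     {"type": "text", "text": "there"}]}))
# -> "\nhi\nthere"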

View File

@ -1,13 +1,15 @@
import asyncio import asyncio
import asyncio.threads
import requests import requests
import numpy as np import numpy as np
n = 8 n = 8
result = [] result = []
async def requests_post_async(*args, **kwargs): async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs) return await asyncio.threads.to_thread(requests.post, *args, **kwargs)
async def main(): async def main():
model_url = "http://127.0.0.1:6900" model_url = "http://127.0.0.1:6900"
@ -31,4 +33,3 @@ for i in range(n-1):
embedding2 = np.array(result[j]) embedding2 = np.array(result[j])
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
print(f"Similarity between {i} and {j}: {similarity:.2f}") print(f"Similarity between {i} and {j}: {similarity:.2f}")

View File

@ -34,4 +34,3 @@ fi
#use multiple GPUs with same max compute units #use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

View File

@ -31,4 +31,3 @@ exit /B 0
:ERROR :ERROR
echo comomand error: %errorlevel% echo comomand error: %errorlevel%
echo command error: %errorlevel% echo command error: %errorlevel%
exit /B %errorlevel% exit /B %errorlevel%

View File

@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0

View File

@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n"); fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n");
} }
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
// variables where to put any arguments we see. // variables where to put any arguments we see.
bool printing_ids = false; bool printing_ids = false;
bool no_bos = false; bool no_bos = false;
bool no_parse_special = false;
bool disable_logging = false; bool disable_logging = false;
bool show_token_count = false;
const char * model_path = NULL; const char * model_path = NULL;
const char * prompt_path = NULL; const char * prompt_path = NULL;
const char * prompt_arg = NULL; const char * prompt_arg = NULL;
@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--no-bos") { else if (arg == "--no-bos") {
no_bos = true; no_bos = true;
} }
else if (arg == "--no-parse-special") {
no_parse_special = true;
}
else if (arg == "-p" || arg == "--prompt") { else if (arg == "-p" || arg == "--prompt") {
if (prompt_set) { if (prompt_set) {
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--log-disable") { else if (arg == "--log-disable") {
disable_logging = true; disable_logging = true;
} }
else if (arg == "--show-count") {
show_token_count = true;
}
else { else {
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str()); fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
return 1; return 1;
@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
const bool model_wants_add_bos = llama_should_add_bos_token(model); const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos; const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, true); tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) { if (printing_ids) {
printf("["); printf("[");
@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
printf("]\n"); printf("]\n");
} }
if (show_token_count) {
printf("Total number of tokens: %ld\n", tokens.size());
}
// silence valgrind // silence valgrind
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
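The two new flags added above can be exercised together; a hedged usage sketch (the binary and model paths below are assumptions, adjust them to your build):

import subprocess

cmd = [
    "./build/bin/llama-tokenize",         # assumed binary name/location
    "-m", "models/llama-2-7b.Q4_0.gguf",  # placeholder model path
    "-p", "Hello <s> world",
    "--no-parse-special",                 # treat control tokens as plain text
    "--show-count",                       # print the total number of tokens
]
print(subprocess.run(cmd, capture_output=True, text=True).stdout)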

View File

@ -66,7 +66,7 @@ class Tensor:
if len(self.ne) == 0: if len(self.ne) == 0:
self.nbytes = 0 self.nbytes = 0
else: else:
self.nbytes = int(np.product(self.ne)) * 4 self.nbytes = int(np.prod(self.ne)) * 4
else: else:
raise ValueError(f"Unhandled data type '{self.dtype}'") raise ValueError(f"Unhandled data type '{self.dtype}'")
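np.product was deprecated in favor of np.prod (and dropped in NumPy 2.0), hence the switch above; the byte-count arithmetic is unchanged. A tiny illustration with made-up dimensions:

import numpy as np

ne = (2, 3, 4)                  # tensor dimensions
nbytes = int(np.prod(ne)) * 4   # 24 float32 elements * 4 bytes = 96
print(nbytes)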

View File

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1717285511, "lastModified": 1719994518,
"narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=", "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8", "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1718895438, "lastModified": 1720031269,
"narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=", "narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3", "rev": "9f4128e00b0ae8ec65918efeba59db998750ead6",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -36,14 +36,14 @@
}, },
"nixpkgs-lib": { "nixpkgs-lib": {
"locked": { "locked": {
"lastModified": 1717284937, "lastModified": 1719876945,
"narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=", "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
"type": "tarball", "type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
}, },
"original": { "original": {
"type": "tarball", "type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
} }
}, },
"root": { "root": {

View File

@ -104,7 +104,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor") "ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF) option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)

View File

@ -99,6 +99,8 @@ async def main():
tasks = [] tasks = []
base_dict = {"FLOAT_TYPE": "float"}
for fp16 in (False, True): for fp16 in (False, True):
# MUL_MAT # MUL_MAT
matmul_shaders(tasks, fp16, False) matmul_shaders(tasks, fp16, False)
@ -106,8 +108,6 @@ async def main():
matmul_shaders(tasks, fp16, True) matmul_shaders(tasks, fp16, True)
for tname in type_names: for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec # mul mat vec
data_a_key = f"DATA_A_{tname.upper()}" data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp" shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"

View File

@ -63,4 +63,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -383,6 +383,9 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@ -424,6 +427,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
@ -708,9 +714,9 @@ extern "C" {
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type); GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED( GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
@ -2404,20 +2410,31 @@ extern "C" {
#endif #endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, typedef void (*ggml_from_float_to_mat_t)
const void * GGML_RESTRICT y, size_t by, int nrc); (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef struct { typedef struct {
const char * type_name; const char * type_name;
int blck_size; int64_t blck_size;
size_t type_size; int64_t blck_size_interleave; // interleave elements in blocks
bool is_quantized; size_t type_size;
ggml_to_float_t to_float; bool is_quantized;
ggml_from_float_t from_float; ggml_to_float_t to_float;
ggml_from_float_t from_float_reference; ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot; ggml_from_float_t from_float_ref;
enum ggml_type vec_dot_type; ggml_from_float_to_mat_t from_float_to_mat;
int64_t nrows; // number of rows to process simultaneously; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t; } ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
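The accessors touched above encode simple block arithmetic: a quantized type packs blck_size elements into type_size bytes, so a row of ne elements occupies type_size * ne / blck_size bytes. A small Python sketch of that relation, using Q4_0's 32-element, 18-byte blocks as the worked example (the row width is illustrative):

def row_size(type_size: int, blck_size: int, ne: int) -> int:
    # mirrors the ggml_row_size() arithmetic: bytes for ne elements of a blocked type
    assert ne % blck_size == 0
    return type_size * (ne // blck_size)

print(row_size(18, 32, 4096))   # Q4_0, 4096-wide row -> 2304 bytes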

View File

@ -238,12 +238,12 @@ if (GGML_BLAS)
endif() endif()
if (GGML_LLAMAFILE) if (GGML_LLAMAFILE)
message(STATUS "Using ggml SGEMM") message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE) add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE sgemm.h) set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp) set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
endif() endif()
if (GGML_CUDA) if (GGML_CUDA)
@ -486,9 +486,11 @@ if (GGML_SYCL)
add_compile_options(-I./) #include DPCT add_compile_options(-I./) #include DPCT
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
endif() endif()
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp") file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@ -1151,6 +1153,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
ggml-aarch64.c ggml-aarch64.h
) )
if (EMSCRIPTEN) if (EMSCRIPTEN)
@ -1166,9 +1169,12 @@ target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
find_library(MATH_LIBRARY m) find_library(MATH_LIBRARY m)
if (MATH_LIBRARY) if (MATH_LIBRARY)
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY}) if (NOT WIN32 OR NOT GGML_SYCL)
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
endif()
endif() endif()
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
endif() endif()

Some files were not shown because too many files have changed in this diff.