Merge branch 'ggerganov:master' into prepare-PR-of-minicpm-v2.5

2025-01-12 11:40:17 +00:00 · 2024-06-04 14:52:39 +08:00 · 2024-06-04 14:52:39 +08:00 · c390dd4e22
commit c390dd4e22
parent a913ca4cb9 bde7cd3cd9
197 changed files with 83031 additions and 16365 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 ENV LC_ALL=C.utf8
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
-RUN make
+RUN make -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT [ "/app/main" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -9,7 +9,7 @@ WORKDIR /app
 COPY . .
-RUN make
+RUN make -j$(nproc)
 FROM ubuntu:$UBUNTU_VERSION as runtime
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev
@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@ -11,7 +11,7 @@ COPY . .
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 FROM ubuntu:$UBUNTU_VERSION as runtime
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -8,7 +8,7 @@ arg1="$1"
 shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
--- a/.github/ISSUE_TEMPLATE/06-question.yml
+++ b/.github/ISSUE_TEMPLATE/06-question.yml
@ -1,38 +0,0 @@
 name: Question
 description: Used to ask questions about llama.cpp
 title: "Question: "
 labels: ["question"]
 body:
  - type: markdown
    attributes:
      value: |
        [Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your question.
      options:
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
          required: true
  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
      placeholder: Detailed description of your question
    validations:
      required: true
  - type: textarea
    id: possible-answer
    attributes:
      label: Possible Answer
      description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
      placeholder: Your idea of possible answers
    validations:
      required: false
--- a/.github/ISSUE_TEMPLATE/06-research.yml
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@ -0,0 +1,52 @@
 name: Research
 description: Track new technical research area
 title: "Research: "
 labels: ["research 🔬"]
 body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and its effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)
  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: What's the current state of the art and what's the motivation for this research?
  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and its effect?
  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?
  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -0,0 +1,13 @@
 blank_issues_enabled: true
 contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -294,12 +294,22 @@ jobs:
      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -42,9 +42,8 @@ jobs:
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
--- a/.gitignore
+++ b/.gitignore
@ -107,6 +107,7 @@ examples/jeopardy/results.txt
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
 examples/server/*.css.hpp
 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -106,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
 option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
 option(LLAMA_CUDA_FA_ALL_QUANTS              "llama: compile all quants for FlashAttention"     OFF)
 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
@ -125,6 +126,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
 option(LLAMA_OPENMP                          "llama: use OpenMP"                                ON)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@ -295,6 +297,17 @@ if (LLAMA_METAL)
        )
 endif()
 if (LLAMA_OPENMP)
    find_package(OpenMP)
    if (OpenMP_FOUND)
        message(STATUS "OpenMP found")
        add_compile_definitions(GGML_USE_OPENMP)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
    else()
        message(WARNING "OpenMP not found")
    endif()
 endif()
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@ -402,6 +415,8 @@ if (LLAMA_CUDA)
        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        add_compile_definitions(GGML_USE_CUDA)
        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@ -427,6 +442,18 @@ if (LLAMA_CUDA)
        if (LLAMA_CUDA_NO_PEER_COPY)
            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
        endif()
        if (LLAMA_CUDA_FA_ALL_QUANTS)
            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
            list(APPEND GGML_SOURCES_CUDA ${SRCS})
            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
        else()
            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
            list(APPEND GGML_SOURCES_CUDA ${SRCS})
            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
            list(APPEND GGML_SOURCES_CUDA ${SRCS})
            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
            list(APPEND GGML_SOURCES_CUDA ${SRCS})
        endif()
        if (LLAMA_STATIC)
            if (WIN32)
@ -571,6 +598,8 @@ if (LLAMA_HIPBLAS)
    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
    list(APPEND GGML_SOURCES_ROCM ${SRCS})
    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@ -590,6 +619,19 @@ if (LLAMA_HIPBLAS)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()
    if (LLAMA_CUDA_FA_ALL_QUANTS)
        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
        list(APPEND GGML_SOURCES_ROCM ${SRCS})
        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
    else()
        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
        list(APPEND GGML_SOURCES_ROCM ${SRCS})
        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
        list(APPEND GGML_SOURCES_ROCM ${SRCS})
        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
        list(APPEND GGML_SOURCES_ROCM ${SRCS})
    endif()
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@ -628,6 +670,10 @@ if (LLAMA_SYCL)
        add_compile_definitions(GGML_SYCL_F16)
    endif()
    if (LLAMA_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
    endif()
    add_compile_options(-I./) #include DPCT
    add_compile_options(-I/${SYCL_INCLUDE_DIR})
@ -743,6 +789,7 @@ if (LLAMA_KOMPUTE)
            kompute-shaders/op_mul_mat_q4_0.comp
            kompute-shaders/op_mul_mat_q4_1.comp
            kompute-shaders/op_mul_mat_q6_k.comp
            kompute-shaders/op_getrows_f32.comp
            kompute-shaders/op_getrows_f16.comp
            kompute-shaders/op_getrows_q4_0.comp
            kompute-shaders/op_getrows_q4_1.comp
@ -775,6 +822,7 @@ if (LLAMA_KOMPUTE)
            shaderop_mul_mat_q4_0.h
            shaderop_mul_mat_q4_1.h
            shaderop_mul_mat_q6_k.h
            shaderop_getrows_f32.h
            shaderop_getrows_f16.h
            shaderop_getrows_q4_0.h
            shaderop_getrows_q4_1.h
@ -1310,7 +1358,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
@ -1337,6 +1385,13 @@ if (LLAMA_METAL)
    endif()
 endif()
 configure_file(cmake/llama.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        @ONLY)
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION lib/pkgconfig)
 #
 # programs, examples and tests
 #
--- a/68
+++ b/68
@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif
 	LLAMA_NO_OPENMP := 1
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)
@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 ifdef LLAMA_RPC
 	BUILD_TARGETS += rpc-server
 endif
 default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
 ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
 endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
 ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif
+endif # LLAMA_DEBUG
 endif # LLAMA_FAST
 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@ -204,6 +214,7 @@ ifdef LLAMA_DEBUG
 	MK_CFLAGS    += -O0 -g
 	MK_CXXFLAGS  += -O0 -g
 	MK_LDFLAGS   += -g
 	MK_NVCCFLAGS += -O0 -g
 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@ -400,6 +411,12 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE
 ifndef LLAMA_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
 endif # LLAMA_NO_OPENMP
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
@ -416,11 +433,25 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 ifdef LLAMA_RPC
 	MK_CPPFLAGS   += -DGGML_USE_RPC
 	OBJS          += ggml-rpc.o
 endif # LLAMA_RPC
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif
 OBJS_CUDA_TEMP_INST      = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
 ifdef LLAMA_CUDA_FA_ALL_QUANTS
 	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
 else
 	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
 	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
 	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
 endif # LLAMA_CUDA_FA_ALL_QUANTS
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
@ -431,6 +462,7 @@ ifdef LLAMA_CUDA
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@ -493,7 +525,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
 ifdef LLAMA_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # LLAMA_CUDA_FA_ALL_QUANTS
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@ -505,7 +540,7 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
@ -571,6 +606,7 @@ ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@ -584,11 +620,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
 	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	OBJS        += $(OBJS_CUDA_TEMP_INST)
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # LLAMA_HIPBLAS
@ -626,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 ifdef LLAMA_RPC
 ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # LLAMA_RPC
 GF_CC := $(CC)
 include scripts/get-flags.mk
@ -710,14 +762,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@ -748,6 +795,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
 	rm -vrf ggml-cuda/template-instances/*.o
 	find examples pocs -type f -name "*.o" -delete
 #
@ -816,7 +864,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
--- a/README-sycl.md
+++ b/README-sycl.md
@ -55,8 +55,8 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 ## OS
 | OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------|
+|---------|---------|------------------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
 | Windows | Support | Windows 11                                     |
@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
--- a/README.md
+++ b/README.md
@ -2,7 +2,9 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Hot topics
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
 - Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@ -147,6 +150,8 @@ Typically finetunes of the base models below are supported as well.
 [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
 [simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using the above web server (use --path to point to simplechat), from a local web browser.
 **Bindings:**
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@ -200,6 +205,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@ -315,8 +321,6 @@ In order to build llama.cpp you have four different options.
      make
      ```
      **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
  - On Windows:
    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@ -328,6 +332,11 @@ In order to build llama.cpp you have four different options.
        make
        ```
  - Notes:
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, run `make LLAMA_DEBUG=1`
 - Using `CMake`:
  ```bash
@ -335,16 +344,20 @@ In order to build llama.cpp you have four different options.
  cmake --build build --config Release
  ```
-    **Note**: for `Debug` builds, there are two cases:
+  **Notes**:
-    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, there are two cases:
      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
      ```bash
      cmake -B build -DCMAKE_BUILD_TYPE=Debug
      cmake --build build
      ```
-    - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
      ```bash
      cmake -B build -G "Xcode"
@ -379,6 +392,14 @@ In order to build llama.cpp you have four different options.
    CLBLAST support for using OpenCL GPU acceleration in FreeBSD. Please read
    the instructions below in this document on how to use and activate this option.
 ### Homebrew
 On Mac and Linux, the homebrew package manager can be used via
 ```
 brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
 ### Metal Build
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@ -478,9 +499,11 @@ Building the program with BLAS support may lead to some performance improvements
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                               |
  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |                                                                                                                                         |
  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 - #### hipBLAS
@ -696,7 +719,8 @@ Building the program with BLAS support may lead to some performance improvements
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
 It does not support LLaMA 3; you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@ -713,10 +737,10 @@ ls ./models
 python3 -m pip install -r requirements.txt
 # convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/
 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
--- a/ci/run.sh
+++ b/ci/run.sh
@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
-    python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@ -0,0 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
 libdir=${exec_prefix}/lib
 includedir=${prefix}/include
 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
 Version: @PROJECT_VERSION@
 Libs: -L${libdir} -lllama
 Cflags: -I${includedir}
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1002,9 +1002,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
    if (arg == "--split-mode" || arg == "-sm") {
@ -1030,9 +1030,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-#ifndef GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
    if (arg == "--tensor-split" || arg == "-ts") {
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -25,8 +25,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from convert import LlamaHfVocab
 logger = logging.getLogger("hf-to-gguf")
@ -634,7 +632,7 @@ class Model:
        special_vocab.add_to_gguf(self.gguf_writer)
    def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
        tokens = []
        scores = []
        toktypes = []
@ -2804,7 +2802,12 @@ def main() -> None:
    hparams = Model.load_hparams(dir_model)
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
        logger.info("Set model parameters")
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF
 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    # use .parent.parent since we are in "examples" directory
    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
 import gguf
 from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias
@ -380,306 +382,6 @@ class Metadata:
        return metadata
 #
 # vocab
 #
@runtime_checkable
 class BaseVocab(Protocol):
    # Minimal protocol shared by every vocabulary class below: two class-level strings.
    # Tokenizer family; the classes in this file set "no_vocab", "gpt2" or "llama".
    tokenizer_model: ClassVar[str]
    # Short vocabulary identifier; the classes in this file set "no_vocab", "bpe", "spm" or "hfft".
    name: ClassVar[str]
 class NoVocab(BaseVocab):
    # Placeholder vocabulary for a model that has no integrated vocabulary.
    # It carries only the two identifying strings and holds no token data.
    tokenizer_model = "no_vocab"
    name = "no_vocab"
    def __repr__(self) -> str:
        # Fixed description; there is no instance state to report.
        return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
 class Vocab(BaseVocab, Protocol):
    # Protocol for vocabularies that are loaded from a model directory and can
    # enumerate their tokens.
    # Total token count; the implementations below set it to base size + number of added tokens.
    vocab_size: int
    # Added token text -> token id.
    added_tokens_dict: dict[str, int]
    # Added token texts; the implementations below keep them ordered by token id.
    added_tokens_list: list[str]
    # Path of the tokenizer file the vocabulary was read from.
    fname_tokenizer: Path
    # base_path: directory that holds the tokenizer files.
    def __init__(self, base_path: Path): ...
    # Yields one (text, score, token type) triple per token.
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
 class BpeVocab(Vocab):
    # GPT-2 style BPE vocabulary. It is read from `vocab.json` (the "slow" tokenizer
    # layout) when that file exists, otherwise from the fast-tokenizer JSON file named
    # by FAST_TOKENIZER_FILE.
    tokenizer_model = "gpt2"
    name = "bpe"
    def __init__(self, base_path: Path):
        # base_path: directory that holds the tokenizer files.
        # Raises FileNotFoundError when the fast tokenizer file is missing or is not a
        # byte-level BPE tokenizer without byte fallback, and ValueError when the added
        # token ids do not directly follow the base vocabulary.
        added_tokens: dict[str, int] = {}
        if (fname_tokenizer := base_path / 'vocab.json').exists():
            # "slow" tokenizer
            with open(fname_tokenizer, encoding="utf-8") as f:
                self.vocab = json.load(f)
            try:
                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                # The added-tokens file is optional; keep the empty dict.
                pass
        else:
            # "fast" tokenizer
            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
            # if this fails, FileNotFoundError propagates to caller
            with open(fname_tokenizer, encoding="utf-8") as f:
                tokenizer_json = json.load(f)
            tokenizer_model: dict[str, Any] = tokenizer_json['model']
            # Accept only a BPE model without byte fallback whose decoder is ByteLevel.
            if (
                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
                or tokenizer_json['decoder']['type'] != 'ByteLevel'
            ):
                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
            self.vocab = tokenizer_model["vocab"]
            if (added := tokenizer_json.get('added_tokens')) is not None:
                # Added tokens here can be duplicates of the main vocabulary.
                added_tokens = {item['content']: item['id']
                                for item in added
                                if item['content'] not in self.vocab}
        # Added token ids must be exactly vocab_size, vocab_size + 1, ... with no gaps.
        vocab_size   = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
        # Order the added tokens by id so the list index follows the id order.
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict    = added_tokens
        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base      = vocab_size
        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields the base tokens in id order, each with score 0.0 and type NORMAL.
        # Looking up reverse_vocab[i] for i in 0..len(vocab)-1 raises KeyError if the
        # ids in self.vocab are not exactly that range.
        # NOTE(review): keys loaded via json.load are str, while the annotation says
        # bytes - confirm that callers accept str here.
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields the added tokens in id order, UTF-8 encoded, with a fixed score of
        # -1000.0 and type CONTROL.
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Base tokens first, then added tokens.
        yield from self.bpe_tokens()
        yield from self.added_tokens()
    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 class SentencePieceVocab(Vocab):
    # Vocabulary backed by a SentencePiece `tokenizer.model` file, looked up in the
    # model directory first and in its parent directory second.
    tokenizer_model = "llama"
    name = "spm"
    def __init__(self, base_path: Path):
        # base_path: directory that holds the tokenizer files.
        # Raises FileNotFoundError when `tokenizer.model` is in neither location and
        # ValueError when the new token ids do not directly follow the base vocabulary.
        added_tokens: dict[str, int] = {}
        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
            # normal location
            try:
                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                # The added-tokens file is optional; keep the empty dict.
                pass
        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')
        # Note: when the model is found in the parent directory, no added-tokens file is read.
        self.sentencepiece_tokenizer = SentencePieceProcessor()
        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()
        # Keep only added tokens whose id lies beyond the base vocabulary, keyed by id.
        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids   = sorted(new_tokens.keys())
        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
        # Token pieces that were added to the base vocabulary.
        # added_tokens_dict keeps every entry from the file; added_tokens_list holds only the new ones.
        self.added_tokens_dict  = added_tokens
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields (UTF-8 piece, score, type) for every id in the SentencePiece model.
        # The type checks below are independent `if`s, so a later match overrides an earlier one.
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(i)
            text         = piece.encode("utf-8")
            score: float = tokenizer.GetScore(i)
            toktype = gguf.TokenType.NORMAL
            if tokenizer.IsUnknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.IsControl(i):
                toktype = gguf.TokenType.CONTROL
            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
            if tokenizer.IsUnused(i):
                toktype = gguf.TokenType.UNUSED
            if tokenizer.IsByte(i):
                toktype = gguf.TokenType.BYTE
            yield text, score, toktype
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields the new tokens in id order, UTF-8 encoded, with a fixed score of
        # -1000.0 and type USER_DEFINED.
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Base SentencePiece tokens first, then added tokens.
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()
    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 class LlamaHfVocab(Vocab):
    # Llama vocabulary loaded through the Hugging Face `transformers` fast tokenizer,
    # using the JSON file named by FAST_TOKENIZER_FILE in the model directory.
    tokenizer_model = "llama"
    name = "hfft"
    def __init__(self, base_path: Path):
        # base_path: directory that holds the tokenizer files.
        # Raises FileNotFoundError when the tokenizer file is missing or is not a Llama
        # BPE tokenizer, TypeError when it looks like a Llama 3 tokenizer, and
        # ImportError when `transformers` is not installed.
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
            tokenizer_json = json.load(f)
        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        # Treated as Llama 3: BPE with `ignore_merges` set and `byte_fallback` explicitly false.
        is_llama3 = (
            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
            and not tokenizer_model.get('byte_fallback', True)
        )
        if is_llama3:
            raise TypeError('Llama 3 must be converted with BpeVocab')
        # Otherwise require BPE with byte fallback and a 'Sequence' decoder.
        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
        # Import lazily so `transformers` is only needed when this class is actually used.
        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_path,
            cache_dir=base_path,
            local_files_only=True,
        )
        assert self.tokenizer.is_fast  # assume tokenizer.json is used
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
        self.added_tokens_ids  = set()
        # Process added tokens
        # (sorted by token id so added_tokens_list follows id order)
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)
        # Store special tokens and their IDs
        self.specials = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)
        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer
    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields (UTF-8 text, score, type) for each base-vocabulary id in ascending order.
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }
        for token_id in range(self.vocab_size_base):
            # Skip processing added tokens here
            if token_id in self.added_tokens_ids:
                continue
            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")
            # Yield token text, score, and type
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, token_text, self.special_ids  # Reuse already stored special IDs
            )
    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
        # Classifies one token: BYTE when the text has the form <0xHH>, CONTROL when the
        # id is in special_ids, NORMAL otherwise.
        # Special case for byte tokens
        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
            return gguf.TokenType.BYTE
        # Determine token type based on whether it's a special token
        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
    def get_token_score(self, token_id: int) -> float:
        # Returns the same constant for every token id.
        # Placeholder for actual logic to determine the token's score
        # This needs to be implemented based on specific requirements
        return -1000.0  # Default score
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Yields the added tokens in id order. Added tokens that are also special tokens
        # are typed via get_token_type; all others are USER_DEFINED with score -1000.0.
        for text in self.added_tokens_list:
            if text in self.specials:
                # Empty text is passed, so the byte-token pattern cannot match here and
                # the result depends only on membership in special_ids.
                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
            yield text.encode("utf-8"), score, toktype
    def has_newline_token(self):
        # True when the tokenizer vocabulary contains either the byte token "<0x0A>" or a literal "\n".
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # Base tokens first, then added tokens.
        yield from self.hf_tokens()
        yield from self.added_tokens()
    def __repr__(self) -> str:
        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -178,6 +178,7 @@ struct cmd_params {
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<std::string> rpc_servers;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {cpu_get_num_math()},
    /* n_gpu_layers  */ {99},
    /* rpc_servers   */ {""},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@ -384,6 +387,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
        } else if (arg == "-rpc" || arg == "--rpc") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rpc_servers.push_back(argv[i]);
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
@ -519,6 +528,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@ -541,6 +551,7 @@ struct cmd_params_instance {
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    std::string rpc_servers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
@ -553,6 +564,9 @@ struct cmd_params_instance {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = n_gpu_layers;
        if (!rpc_servers.empty()) {
            mparams.rpc_servers = rpc_servers.c_str();
        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
@ -564,6 +578,7 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
@ -592,6 +607,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & rpc : params.rpc_servers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@ -692,6 +711,7 @@ struct test {
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@ -790,6 +810,9 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
        if (rpc) {
            return "RPC";
        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@ -803,7 +826,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
@ -859,7 +882,7 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            // Keep these in the same order as the names in get_fields(): the 4th entry is "kompute".
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
-            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@ -894,6 +917,7 @@ const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const bool        test::rpc          = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
    --projector-type ldpv2
 ```
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 ```
 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
 ```
 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 6) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 7) And finally we can run the llava-cli using the 1.6 model version:
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@ -1,3 +1,3 @@
-r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
 pillow~=10.2.0
 torch~=2.1.1
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@ -1,98 +0,0 @@
 #!/usr/bin/env python3
 """
 This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
 Usage:
 python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
 Arguments:
 - model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
 - --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
 - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
 - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
 - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
 - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
 Old quant types (some base model types require these):
 - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
 - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
 - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
 - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
 New quant types (recommended):
 - Q2_K: smallest, extreme quality loss - not recommended
 - Q3_K: alias for Q3_K_M
 - Q3_K_S: very small, very high quality loss
 - Q3_K_M: very small, very high quality loss
 - Q3_K_L: small, substantial quality loss
 - Q4_K: alias for Q4_K_M
 - Q4_K_S: small, significant quality loss
 - Q4_K_M: medium, balanced quality - recommended
 - Q5_K: alias for Q5_K_M
 - Q5_K_S: large, low quality loss - recommended
 - Q5_K_M: large, very low quality loss - recommended
 - Q6_K: very large, extremely low quality loss
 - Q8_0: very large, extremely low quality loss - not recommended
 - F16: extremely large, virtually no quality loss - not recommended
 - F32: absolutely huge, lossless - not recommended
 """
 import subprocess
 # Install the pinned huggingface-hub release at import time, before the
 # `huggingface_hub` import below; check=True aborts the script if pip fails.
 subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
 import argparse
 import os
 from huggingface_hub import snapshot_download
 def main(model, model_type, outname, outdir, quants, keep_fp16):
    """Convert a Hugging Face model to an FP16 GGUF file, then quantize it.

    All helper paths are relative to the parent directory ('..'), so the
    script has to be started from the examples folder.

    Args:
        model: Path to a local model directory, or a Hugging Face repo id that
            is downloaded when no such directory exists.
        model_type: 'llama' selects ../convert.py; any other value selects
            ../convert-{model_type}-hf-to-gguf.py.
        outname: Base name of the output files. When None it defaults to the
            last component of the model directory path or of the repo id.
        outdir: Output directory; defaults to '../models/{outname}'.
        quants: Iterable of quantization type names handed to ../quantize.
        keep_fp16: When false, the intermediate FP16 file is deleted at the end.

    Raises:
        Exception: If the download fails or config.json is missing.
        subprocess.CalledProcessError: If make, the converter or quantize fails.
    """
    if os.path.isdir(model):
        if outname is None:
            # Local directory: use its last path component, as the usage text
            # promises (previously outname stayed None -> '../models/None').
            outname = os.path.basename(os.path.abspath(model))
    else:
        print(f"Model not found at {model}. Downloading...")
        try:
            if outname is None:
                outname = model.split('/')[-1]
            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
        except Exception as e:
            raise Exception(f"Could not download the model: {e}") from e

    if outdir is None:
        outdir = f'../models/{outname}'

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)

    # Commands are passed as argument lists (no shell) so that paths containing
    # spaces or shell metacharacters reach the tools unchanged.
    print("Building llama.cpp")
    subprocess.run(["make", "quantize"], cwd="..", check=True)

    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"

    print(f"Making unquantised GGUF at {fp16}")
    if not os.path.isfile(fp16):
        if model_type != "llama":
            convert_cmd = ["python3", f"../convert-{model_type}-hf-to-gguf.py", model, "1", "--outfile", fp16]
        else:
            convert_cmd = ["python3", "../convert.py", model, "--outtype", "f16", "--outfile", fp16]
        subprocess.run(convert_cmd, check=True)
    else:
        print(f"Unquantised GGML already exists at: {fp16}")

    print("Making quants")
    for quant in quants:
        outfile = f"{outdir}/{outname}.gguf.{quant}.bin"
        print(f"Making {quant} : {outfile}")
        subprocess.run(["../quantize", fp16, outfile, quant], check=True)

    if not keep_fp16:
        os.remove(fp16)
 if __name__ == "__main__":
    # Command-line entry point: collect the options and forward them to main().
    cli = argparse.ArgumentParser(
        description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.'
    )
    cli.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
    cli.add_argument(
        '--model_type',
        required=True,
        choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'],
        help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.',
    )
    cli.add_argument('--outname', default=None, help='Output model(s) name')
    cli.add_argument('--outdir', default=None, help='Output directory')
    cli.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
    cli.add_argument('--keep_fp16', action='store_true', default=False, help='Keep fp16 model')

    opts = cli.parse_args()
    main(
        model=opts.model,
        model_type=opts.model_type,
        outname=opts.outname,
        outdir=opts.outdir,
        quants=opts.quants,
        keep_fp16=opts.keep_fp16,
    )
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 #ifdef GGML_USE_SYCL
 #include "ggml-sycl.h"
 #endif
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
 #elif GGML_USE_SYCL
    fprintf(stderr, "%s: using SYCL backend\n", __func__);
    backend = ggml_backend_sycl_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
    }
 #endif
    // if there aren't GPU Backends fallback to CPU backend
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -8,9 +8,20 @@ set(TARGET_SRCS
    httplib.h
 )
 # Asset files (stylesheets, HTML pages, scripts) that the
 # foreach(asset ${PUBLIC_ASSETS}) loop right after this list iterates over.
 # A new file must be added here to be picked up by that loop.
 set(PUBLIC_ASSETS
    colorthemes.css
    style.css
    theme-beeninorder.css
    theme-ketivah.css
    theme-mangotango.css
    theme-playground.css
    theme-polarnight.css
    theme-snowstorm.css
    index.html
    index-new.html
    index.js
    completion.js
    system-prompts.js
    prompt-formats.js
    json-schema-to-grammar.mjs
 )
 foreach(asset ${PUBLIC_ASSETS})
--- a/examples/server/public/colorthemes.css
+++ b/examples/server/public/colorthemes.css
@ -0,0 +1,402 @@
@import url("theme-snowstorm.css");
@import url("theme-polarnight.css");
@import url("theme-ketivah.css");
@import url("theme-mangotango.css");
@import url("theme-playground.css");
@import url("theme-beeninorder.css");
 :root {
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(217.5, 26.7%, 94.1%);
    --primary-color-1-hue:             217.5;
    --primary-color-1-saturation:      26.7%;
    --primary-color-1-lightness:       94.1%;
 --primary-color-2: hsl(218.2, 26.8%, 92.0%);
    --primary-color-2-hue:             218.2;
    --primary-color-2-saturation:      26.8%;
    --primary-color-2-lightness:       92.0%;
 --primary-color-3: hsl(218.8, 27.9%, 88.0%);
    --primary-color-3-hue:             218.8;
    --primary-color-3-saturation:      27.9%;
    --primary-color-3-lightness:       88.0%;
 --primary-color-4: hsl(218.8, 18.3%, 81.8%);
    --primary-color-4-hue:             218.8;
    --primary-color-4-saturation:      18.3%;
    --primary-color-4-lightness:       81.8%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(220.0, 16.4%, 21.6%);
    --secondary-color-1-hue:             220.0;
    --secondary-color-1-saturation:      16.4%;
    --secondary-color-1-lightness:       21.6%;
 --secondary-color-2: hsl(221.7, 16.3%, 27.6%);
    --secondary-color-2-hue:             221.7;
    --secondary-color-2-saturation:      16.3%;
    --secondary-color-2-lightness:       27.6%;
 --secondary-color-3: hsl(220.0, 16.8%, 31.6%);
    --secondary-color-3-hue:             220.0;
    --secondary-color-3-saturation:      16.8%;
    --secondary-color-3-lightness:       31.6%;
 --secondary-color-4: hsl(220.0, 16.5%, 35.7%);
    --secondary-color-4-hue:             220.0;
    --secondary-color-4-saturation:      16.5%;
    --secondary-color-4-lightness:       35.7%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
    --theme-nuance-color-1-hue:             178.7;
    --theme-nuance-color-1-saturation:      25.1%;
    --theme-nuance-color-1-lightness:       64.9%;
 --theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
    --theme-nuance-color-2-hue:             193.3;
    --theme-nuance-color-2-saturation:      43.4%;
    --theme-nuance-color-2-lightness:       67.5%;
 --theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
    --theme-nuance-color-3-hue:             210.0;
    --theme-nuance-color-3-saturation:      34.0%;
    --theme-nuance-color-3-lightness:       63.1%;
 --theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
    --theme-nuance-color-4-hue:             213.1;
    --theme-nuance-color-4-saturation:      32.0%;
    --theme-nuance-color-4-lightness:       52.2%;
 /* ----------- ROYGP COLORS ------------------ */
 --theme-red-color:    hsl(32.5, 80%, 50%);
 --theme-orange-color: hsl(32.5, 70%, 45%);
 --theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
 --theme-green-color:  hsl(92.4,  27.8%, 64.7%);
 --theme-purple-color: hsl(311.1, 20.2%, 63.1%);
 /* ------------------------------------------- */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--theme-nuance-color-3);
 /* the thumb border reuses the thumb fill color (--ui-range-thumb-color) */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:         var(--theme-nuance-color-4);
 /* ------------------------------------------- */
 --button-alert-text-hover:       var(--primary-color-1);
 --button-alert-color-hover:      var(--theme-orange-color);
 --button-alert-border-hover:     var(--theme-orange-color);
 --button-alert-text-active:      var(--primary-color-1);
 --button-alert-color-active:     var(--theme-red-color);
 --button-alert-border-active:    var(--theme-red-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text:   var(--secondary-color-1);
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(217.5,
    calc(var(--secondary-color-1-saturation) + 35%),
    calc(var(--secondary-color-1-lightness)  - 30%));
 --button-primary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 35%));
 --button-primary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 --button-primary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 50%));
 --button-secondary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 --button-secondary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 --button-secondary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 /* ---------active--------- */
 --button-secondary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) + 40%),
    calc(var(--theme-nuance-color-3-lightness)  - 55%));
 --button-secondary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-secondary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 --button-tertiary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 /* ---------hover---------- */
 --button-tertiary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 }
 /*
 .theme-template {
    If light theme: should go from bright to darker
    If dark theme: should go from dark to brighter
    ideally this should not be anything but steps of
    gray or slightly variants from it
    --primary-color-1: #2E3440;
    --primary-color-2: #3B4252;
    --primary-color-3: #434C5E;
    --primary-color-4: #4C566A;
    If light theme: should go from dark to brighter
    If dark theme: should go from bright to darker
    ideally this should not be anything but steps of
    gray or slightly variants from it
    --secondary-color-1: #ECEFF4;
    --secondary-color-2: #E5E9F0;
    --secondary-color-3: #D8DEE9;
    --secondary-color-4: #C8CED9;
    Choose wisely nuance colors. It is not easy to find
    4 harmonizing nuance colors. But keep in mind, that
    only one accent color could work too.
    --theme-nuance-color-1: #8FBCBB;
    --theme-nuance-color-2: #88C0D0;
    --theme-nuance-color-3: #81A1C1;
    --theme-nuance-color-4: #5E81AC;
    adapt the color red, orange, yellow, green,
    purple to the 'mood' of your overall design
    e.g is it low-contrast? vibrant? dynamic? etc
    --theme-red-color:    #BF616A;
    --theme-orange-color: #D08770;
    --theme-yellow-color: #EBCB8B;
    --theme-green-color:  #A3BE8C;
    --theme-purple-color: #B48EAD;
 NOTE: comment out all of the `--- ...` separator lines below
 ------------------------------------------------
 --background-color-1:
 --background-color-2:
 --background-color-3:
 --background-color-4:
 --border-color-1:
 --border-color-2:
 --border-color-3:
 --border-focus-color:
 --border-focus-shadow:
 --text-color-plain:
 --text-color-subtile-1:
 --text-color-subtile-2:
 --code-background-color:
 --code-text-color:
 --ui-range-thumb-color:
 --ui-range-thumb-border:
 --textarea-border-color:
 -------------------------------------------
 --button-alert-text-hover:
 --button-alert-color-hover:
 --button-alert-border-hover:
 --button-alert-text-active:
 --button-alert-color-active:
 --button-alert-border-active:
 ----------- PRIMARY -----------------------
 --button should immediately catch the eye--
 --button-primary-text:
 --button-primary-color:
 --button-primary-border:
 ---------hover----------
 --button-primary-text-hover:
 --button-primary-color-hover:
 --button-primary-border-hover:
 ---------active---------
 --button-primary-text-active:
 --button-primary-color-active:
 --button-primary-border-active:
 ------------ SECONDARY ------------------------
 --button should NOT immediately catch the eye--
 --button-secondary-text:
 --button-secondary-color:
 --button-secondary-border:
 ---------hover----------
 --button-secondary-text-hover:
 --button-secondary-color-hover:
 --button-secondary-border-hover:
 ---------active---------
 --button-secondary-text-active:
 --button-secondary-color-active:
 --button-secondary-border-active:
 ---------- TERTIARY -----------------------
 ---------- disabled buttons ---------------
 --button-tertiary-text:
 --button-tertiary-color:
 --button-tertiary-border:
 ---------hover----------
 --button-tertiary-text-hover:
 --button-tertiary-color-hover:
 --button-tertiary-border-hover:
 }
 */
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -12,6 +12,18 @@
      font-size: 90%;
    }
    .grid-container {
      display: grid;
      grid-template-columns: auto auto auto;
      padding: 10px;
    }
    .grid-item {
      padding: 5px;
      /* font-size: 30px; */
      text-align: center;
    }
    #container {
      margin: 0em auto;
      display: flex;
@ -35,6 +47,67 @@
      padding: 0.5em;
    }
    h1 {
      text-align: center;
    }
    .customlink:link {
      color: white;
      background-color: #007aff;
      font-weight: 600;
      text-decoration: none;
      float: right;
      margin-top: 30px;
      display: flex;
      flex-direction: row;
      gap: 0.5em;
      justify-content: flex-end;
      border-radius: 4px;
      padding: 8px;
    }
    /* Style of the "New UI" link once it has been visited. It repeats the
       .customlink:link declarations except for border-radius.
       NOTE(review): browsers limit :visited rules to color-type properties for
       privacy reasons, so the layout declarations below are presumably ignored
       and taken from the unvisited state instead; confirm. */
    .customlink:visited {
      color: white;
      background-color: #007aff;
      font-weight: 600;
      text-decoration: none;
      float: right;
      margin-top: 30px;
      display: flex;
      flex-direction: row;
      gap: 0.5em;
      justify-content: flex-end;
      padding: 8px;
    }
    .customlink:hover {
      color: white;
      background-color: #0070ee;
      font-weight: 600;
      text-decoration: none;
      float: right;
      margin-top: 30px;
      display: flex;
      flex-direction: row;
      gap: 0.5em;
      justify-content: flex-end;
      padding: 8px;
    }
    .customlink:active {
      color: #0070ee;
      background-color: #80b3ef;
      font-weight: 600;
      text-decoration: none;
      float: right;
      margin-top: 30px;
      display: flex;
      flex-direction: row;
      gap: 0.5em;
      justify-content: flex-end;
      padding: 8px;
    }
    body {
      max-width: 600px;
      min-width: 300px;
@ -1035,7 +1108,11 @@
      return html`
        <div class="mode-${session.value.type}">
          <header>
-            <h1>llama.cpp</h1>
+            <div class="grid-container">
              <div class="grid-item"></div>
              <div class="grid-item"><h1>llama.cpp</h1></div>
              <div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
            </div>
          </header>
          <main id="content">
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/prompt-formats.js
+++ b/examples/server/public/prompt-formats.js
@ -0,0 +1,331 @@
 // extended list
 export const promptFormats = {
  "alpaca": {
  template: `{{prompt}}\n\n{{history}}\n\n{{char}}:`,
  historyTemplate: `### {{name}}:\n{{message}}`,
  char: "Response",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "Instruction",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "chatml": {
  template: `<|im_start|>system\n{{prompt}}<|im_end|>\n{{history}}{{char}}`,
  historyTemplate: `<|im_start|>{{name}}\n{{message}}`,
  char: "assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "user",
  userMsgPrefix: "",
  userMsgSuffix: "<|im_end|>\n",
  stops: ""
  },
  // ----------------------------
  "commandr": {
  template: `<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{prompt}}\n<|END_OF_TURN_TOKEN|>{{history}}{{char}}`,
  historyTemplate: `<|START_OF_TURN_TOKEN|><|{{name}}|> {{message}}`,
  char: "CHATBOT_TOKEN",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "USER_TOKEN",
  userMsgPrefix: "",
  userMsgSuffix: "<|END_OF_TURN_TOKEN|>",
  stops: ""
  },
  // ref: https://docs.cohere.com/docs/prompting-command-r
  // ----------------------------
  "llama2": {
  template: `<s>[INST] <<SYS>>\n{{prompt}}\n<</SYS>>\n\nTest Message [/INST] Test Successfull </s>{{history}}{{char}}`,
  historyTemplate: `{{name}}: {{message}}`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "</s>",
  user: "User",
  userMsgPrefix: "<s>[INST] ",
  userMsgSuffix: " [/INST]",
  stops: ""
  },
  // ref: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
  // ----------------------------
  "llama3": {
  template: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{prompt}}{{history}}{{char}}`,
  historyTemplate: `<|start_header_id|>{{name}}<|end_header_id|>\n\n{{message}}<|eot_id|>`,
  char: "assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "user",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: "<|eot_id|>"
  },
  // ref: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/#special-tokens-used-with-meta-llama-3
  // ----------------------------
  "openchat": {
  template: `{{history}}{{char}}`,
  historyTemplate: `GPT4 Correct {{name}}: {{message}}<|end_of_turn|>`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "User",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "phi3": {
  template: `{{history}}{{char}}`,
  historyTemplate: `<|{{name}}|>\n{{message}}<|end|>\n`,
  char: "assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "user",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: "<|end|>"
  },
  // ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
  // ----------------------------
  "vicuna": {
  template: `{{prompt}}\n{{history}}{{char}}`,
  historyTemplate: `{{name}}: {{message}}\n`,
  char: "ASSISTANT",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "USER",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ref: https://huggingface.co/lmsys/vicuna-33b-v1.3/discussions/1
  // ----------------------------
  "deepseekCoder": {
  template: `{{prompt}}{{history}}{{char}}:`,
  historyTemplate: `### {{name}}:\n{{message}}`,
  char: "Response",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "Instruction",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: "<|EOT|>"
  },
  // ----------------------------
  "med42": {
  template: `<|system|>: {{prompt}}\n{{history}}{{char}}`,
  historyTemplate: `<|{{name}}|>: {{message}}\n`,
  char: "assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "prompter",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "neuralchat": {
  template: `### System:\n{{prompt}}\n{{history}}{{char}}:`,
  historyTemplate: `### {{name}}:\n{{message}}\n`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "User",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "nousHermes": {
  template: `### Instruction: {{prompt}}\n\n{{history}}\n\n{{char}}:`,
  historyTemplate: `### {{name}}:\n{{message}}`,
  char: "Response",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "Input",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "openchatMath": {
  template: `{{history}}{{char}}`,
  historyTemplate: `Math Correct {{name}}: {{message}}<|end_of_turn|>`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "User",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "orion": {
  template: `<s>Human: Test Message\n\nAssistant: </s>Test Successful</s>{{history}}{{char}}:`,
  historyTemplate: `{{name}}: {{message}}`,
  char: "Assistant </s>",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "Human",
  userMsgPrefix: "",
  userMsgSuffix: "\n\n",
  stops: ""
  },
  // ----------------------------
  "sauerkraut": {
  template: `{{prompt}}\n{{history}}{{char}}`,
  historyTemplate: `
  {{name}}: {{message}}\n`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "User",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "starlingCode": {
  template: `{{history}}{{char}}`,
  historyTemplate: `Code {{name}}: {{message}}<|end_of_turn|>`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "User",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "yi34b": {
  template: `{{history}} {{char}}`,
  historyTemplate: `{{name}}: {{message}}`,
  char: "Assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "Human",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  },
  // ----------------------------
  "zephyr": {
  template: `<|system|>\n{{prompt}}</s>\n{{history}}{{char}}`,
  historyTemplate: `<|{{name}}|>\n{{message}}</s>\n`,
  char: "assistant",
  charMsgPrefix: "",
  charMsgSuffix: "",
  user: "user",
  userMsgPrefix: "",
  userMsgSuffix: "",
  stops: ""
  }
  };
--- a/examples/server/public/style.css
+++ b/examples/server/public/style.css
@ -0,0 +1,954 @@
@import url("colorthemes.css");
 body {
  font-family: 'Arial', sans-serif;
  font-size: 90%;
  background-color: var(--background-color-1);
  color: var(--text-color-subtile-1); /* head 1 llama.cpp & triangle options for some reason */
  max-width: 600px;
  min-width: 300px;
  line-height: 1.2;
  margin: 0 auto;
  padding: 0 0.5em;
  transition: background-color 0.3s;
 }
 /* Text highlighted by the user reuses the primary button palette. */
 ::selection {
  background: var(--button-primary-color);
  color: var(--button-primary-text);
 }
 code, pre code {
  font-family: 'Courier New', monospace;
 }
 #container {
  margin: 0em auto;
  display: flex;
  flex-direction: column;
  justify-content: space-between;
  height: 100%;
 }
 main {
  margin: 3px;
  display: flex;
  flex-direction: column;
  justify-content: space-between;
  gap: 1em;
  flex-grow: 1;
  overflow-y: auto;
  border: 1px solid var(--border-color-3);
  border-radius: 5px;
  padding: 0.5em;
 }
 p {
  overflow-wrap: break-word;
  word-wrap: break-word;
  hyphens: auto;
  margin-top: 0.5em;
  margin-bottom: 0.5em;
 }
 #write form {
  margin: 1em 0 0 0;
  display: flex;
  flex-direction: column;
  gap: 0.5em;
  align-items: stretch;
 }
 .right {
  display: flex;
  flex-direction: row;
  gap: 0.5em;
  justify-content: flex-end;
  margin-bottom: 30px;
 }
 .two-columns {
  width: 97%;
  max-width: 97%;
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 1em;
  position: relative;
 }
 .json-schema-controls {
  margin-top: 10px;
  width: 100%;
  max-width: 100%;
  display: grid;
  grid-template: "a a";
  gap: 1em;
  font-size: x-small;
  color: var(--theme-nuance-color-3);
  padding-top: 16px;
  padding-bottom: 16px;
  text-transform: uppercase;
  font-weight: 600;
 }
 /* Applies to every direct child of .json-schema-controls.
    NOTE(review): .json-schema-controls is declared `display: grid` in this
    stylesheet, so its children are grid items and this `flex` shorthand
    presumably has no effect; confirm whether it can be dropped. */
 .json-schema-controls > * {
  flex: 1;
 }
 /* titles of the details-summary boxes */
 .summary-title {
  font-weight: 600;
  font-size: x-small;
  color: var(--text-color-subtile-1);
  text-transform: uppercase;
  /* transition: ; */
 }
 fieldset {
  border: none;
  padding: 0;
  margin: 0;
  color: var(--text-color-plain);
 }
 fieldset.two {
  display: grid;
  grid-template: "a a a";
  gap: 1em;
  align-items: center;
  font-size: x-small;
  color: var(--text-color-plain);
 }
 fieldset.three {
  display: grid;
  grid-template: "a a a";
  gap: 1em;
  font-size: x-small;
  color: var(--text-color-plain);
 }
 /* titles of name fields*/
 fieldset.names {
  display: grid;
  grid-template: "a a";
  gap: 1em;
  font-size: x-small;
  color: var(--theme-nuance-color-3);
  padding-top: 16px;
  padding-bottom: 16px;
  text-transform: uppercase;
  font-weight: 600;
 }
 /* titles of params fields*/
 fieldset.params {
  display: grid;
  grid-template: "a a";
  gap: 1em;
  font-size: x-small;
  color: var(--theme-nuance-color-4);
  padding-top: 16px;
  padding-bottom: 16px;
  text-transform: uppercase;
  font-weight: 600;
 }
 /* Fieldset holding the dropdowns: flex row of uppercase, x-small, bold labels.
    NOTE(review): with `display: flex` the `grid-template` declaration below
    should have no effect; confirm it is a leftover from the sibling rules.
    NOTE(review): `color: red` is a literal while the sibling fieldset rules
    use theme variables; possibly a debugging leftover, confirm. */
 fieldset.dropdowns {
  -webkit-appearance: none;
  display: flex;
  grid-template: "a a";
  gap: 1em;
  font-size: x-small;
  color: red;
  padding-top: 16px;
  padding-bottom: 16px;
  text-transform: uppercase;
  font-weight: 600;
 }
 /* input of name fields*/
 .names input[type="text"] {
  font-family: Arial, sans-serif;
  font-size: medium;
  font-weight: 500;
  padding: 5px;
  border: 1px solid var(--border-color-2);
 }
 .chat-id-color {
  color: var(--chat-id-color);
 }
 details {
  border: 1px solid var(--border-color-2);
  border-radius: 5px;
  padding: 0.5em 0.5em 0;
  margin-top: 0.5em;
 }
 summary {
  font-weight: bold;
  margin: -0.5em -0.5em 0;
  padding: 0.5em;
  cursor: pointer;
 }
 details[open] {
  padding: 0.5em;
 }
 /* Shared padding, height and alignment for the "-sec" controls.
    NOTE(review): written without a leading `.` or `#`, these are type
    selectors and only match elements literally named <textarea-sec>,
    <input-sec> or <button-sec>; if classes were intended they need a dot
    prefix. Confirm against the markup. */
 textarea-sec, input-sec, button-sec {
  padding: 10px;
  height: 40px;
  align-items: center;
 }
 textarea-sec::placeholder, input-sec::placeholder {
  padding-left: 10px;
 }
 .toggleCheckbox {
  display: none;
 }
 .toggleContainer {
  position: relative;
  display: grid;
  grid-template-columns: repeat(2, 1fr);
  width: fit-content;
  border: 3px solid var(--border-color-2);
  border-radius: 20px;
  background: var(--border-color-2);
  font-size: small;
  cursor: pointer;
  overflow: hidden;
 }
 /* toggle button current state */
 .toggleContainer::before {
  color: var(--button-primary-text);
  background-color: var(--button-primary-color);
  content: '';
  position: absolute;
  width: 50%;
  height: 100%;
  left: 0%;
  border-radius: 20px;
  transition: all 0.3s;
 }
 .toggleContainer div {
  padding: 6px;
  text-align: center;
  z-index: 1;
  transition: color 0.3s;
 }
 .toggleCheckbox:checked + .toggleContainer::before {
  left: 50%;
 }
 .toggleCheckbox:checked + .toggleContainer div:first-child {
  color: var(--text-color-subtile-2);
 }
 .toggleCheckbox:checked + .toggleContainer div:last-child {
  color: var(--button-primary-text);
 }
 .toggleCheckbox + .toggleContainer div:first-child {
  color: var(--button-primary-text);
 }
 .toggleCheckbox + .toggleContainer div:last-child {
  color: var(--text-color-subtile-2);
 }
 select {
  padding: 5px;
  margin-right: 5px;
  border-radius: 4px;
  border: 1px solid var(--secondary-color-4);
  background-color: var(--primary-color-3);
  color: var(--secondary-color-4);
  cursor: pointer;
 }
 select:focus {
  border: 1px solid var(--border-focus-color);
  box-shadow: 0 0 1px var(--border-focus-shadow);
 }
 /* flex row whose children are pushed to the end of the main axis */
 .button-container {
  display: flex;
  justify-content: flex-end;
 }
 /* default button: uses the theme's primary button colors */
 button {
  display: inline-block;
  margin: 4px 2px;
  padding: 10px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  text-shadow: 0 0 30px #ffffff;
  color: var(--button-primary-text);
  background-color: var(--button-primary-color);
  border: 1px solid var(--button-primary-border);
  border-radius: 12px;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 /* pointer over the button */
 button:hover {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-primary-text-hover);
  background-color: var(--button-primary-color-hover);
  border: 1px solid var(--button-primary-border-hover);
 }
 /* button being pressed */
 button:active {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-primary-text-active);
  background-color: var(--button-primary-color-active);
  border: 1px solid var(--button-primary-border-active);
 }
 /* disabled buttons use the tertiary colors and a "not allowed" cursor */
 button:disabled {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-tertiary-text);
  background-color: var(--button-tertiary-color);
  border: 1px solid var(--button-tertiary-border);
  cursor: not-allowed;
 }
 /* reset button: secondary colors at rest, alert colors on hover/press */
 .reset-button {
  width: fit-content;
  height: fit-content;
  overflow: hidden;
  font-size: x-small;
  font-weight: 600;
  color: var(--button-secondary-text);
  background-color: var(--button-secondary-color);
  border: 1px solid var(--button-secondary-color);
  border-radius: 50px;
 }
 .reset-button:hover {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-alert-text-hover);
  background-color: var(--button-alert-color-hover);
  border: 1px solid var(--button-alert-border-hover);
 }
 .reset-button:active {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-alert-text-active);
  background-color: var(--button-alert-color-active);
  border: 1px solid var(--button-alert-border-active);
 }
 /* grammar button: primary colors with a slightly smaller radius and margin */
 .button-grammar {
  display: inline-block;
  margin: 2px;
  padding: 10px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  color: var(--button-primary-text);
  background-color: var(--button-primary-color);
  border: 1px solid var(--button-primary-border);
  border-radius: 10px;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 /* hover restates the box properties and switches to the hover colors */
 .button-grammar:hover {
  display: inline-block;
  margin: 2px;
  padding: 10px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  color: var(--button-primary-text-hover);
  background-color: var(--button-primary-color-hover);
  border: 1px solid var(--button-primary-border-hover);
  border-radius: 10px;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 .button-grammar:active {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-primary-text-active);
  background-color: var(--button-primary-color-active);
  border: 1px solid var(--button-primary-border-active);
 }
 /* back button: same box as the default button, secondary colors */
 .button-back {
  display: inline-block;
  margin: 4px 2px;
  padding: 10px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  color: var(--button-secondary-text);
  background-color: var(--button-secondary-color);
  border: 1px solid var(--button-secondary-color);
  border-radius: 12px;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 /* hover restates the box properties and switches to the hover colors */
 .button-back:hover {
  display: inline-block;
  margin: 4px 2px;
  padding: 10px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  color: var(--button-secondary-text-hover);
  background-color: var(--button-secondary-color-hover);
  border: 1px solid var(--button-secondary-border-hover);
  border-radius: 12px;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 .button-back:active {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-secondary-text-active);
  background-color: var(--button-secondary-color-active);
  border: 1px solid var(--button-secondary-border-active);
 }
 /* NOTE(review): the original author marked the red underline as "unknown";
    its purpose is not documented here. */
 .prob-set {
  border-bottom: 1px solid red; /* unknown */
  padding: 0.3em;
 }
 /* absolutely positioned white box with a soft shadow */
 .popover-content {
  position: absolute;
  padding: 0.2em;
  background-color: white;
  box-shadow: 0 0 13px rgba(0, 0, 0, 0.1);
 }
 /* grammar field is kept at 97% of its container's width */
 .grammar {
  max-width: 97%;
  width: 97%;
 }
 /* base look for every textarea: full width, fixed height, not resizable */
 textarea {
  flex-grow: 1;
  width: 100%;
  max-width: 100%;
  height: 6em;
  padding: 5px;
  border: 1px solid var(--border-color-1);
  border-radius: 8px;
  resize: none;
 }
 /* focus: default outline is replaced by the themed border and glow */
 textarea:focus {
  outline: none;
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* "props" frame: shared box for text fields and range sliders */
 input[type="text"],
 input[type="range"] {
  border: 1px solid var(--border-color-1);
  border-radius: 8px;
  padding: 5px;
 }
 /* "names and props" frame, focused */
 input[type="text"]:focus {
  outline: none;
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* sliders become fully opaque under the pointer */
 input[type="range"]:hover {
  opacity: 1;
 }
 /* focused slider: themed border/glow and the focus variant of the track size */
 input[type="range"]:focus {
  outline: none;
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
  background-size: var(--slider-track-size-focus);
 }
 /* slider handle (Gecko-specific pseudo-element) */
 input[type="range"]::-moz-range-thumb {
  height: 25px;
  width: 6px;
  background-color: var(--ui-range-thumb-color);
  border: 1px solid var(--ui-range-thumb-border);
  border-radius: 5px;
  cursor: pointer;
 }
 /* slider track: native rendering is switched off so the thin custom
    track below is drawn instead; slightly transparent until hovered */
 input[type="range"] {
  /* prefixed and standard form together, as on the slider thumb rule */
  -webkit-appearance: none;
  appearance: none;
  width: 80%;
  height: 1px;
  border: 1px solid var(--border-color-1);
  border-radius: 8px;
  background: var(--border-color-2);
  outline: none;
  opacity: 0.7;
  -webkit-transition: .2s;
  transition: opacity .2s;
 }
 /* slider handle (WebKit/Blink-specific pseudo-element) */
 input[type="range"]::-webkit-slider-thumb {
  -webkit-appearance: none;
  appearance: none;
  height: 25px;
  width: 6px;
  background-color: var(--ui-range-thumb-color);
  border: 1px solid var(--ui-range-thumb-border);
  border-radius: 5px;
  cursor: pointer;
 }
 /* track size comes from a custom property */
 input[type="range"]::-webkit-slider-runnable-track {
  background-size: var(--slider-track-size);
 }
 /* radio buttons are tinted with a theme nuance color */
 input[type="radio"] {
  accent-color: var(--theme-nuance-color-2);
 }
 /* positioning context for the chat input; width pinned to 97% */
 .chat-input-container {
  position: relative;
  min-width: 97%;
  max-width: 97%;
 }
 /* label overlaid on the top-left corner; ignores pointer events */
 .chat-input-label {
  position: absolute;
  left: 0;
  top: 0;
  margin-top: 5px;
  margin-left: 5px;
  color: var(--text-color-plain);
  pointer-events: none;
 }
 /* the chat textarea itself: vertically resizable */
 textarea#chat-input {
  padding-left: 10px;
  padding-top: 10px;
  font-size: medium;
  border: 1px solid var(--border-color-2);
  resize: vertical;
 }
 textarea#chat-input:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* positioning context for an input and its overlaid label */
 .input-container {
  box-sizing: border-box;
  position: relative;
  max-width: 100%; /* ensures the width never exceeds 100% */
  width: 100%; /* sets the width to 100% */
 }
 .input-container:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* titles of name fields*/
 /* fieldset.names {
  display: grid;
  grid-template: "a a";
  gap: 1em;
  font-size: x-small;
  color: var(--theme-nuance-color-3);
  padding-top: 16px;
  padding-bottom: 16px;
  text-transform: uppercase;
  font-weight: 600;
 } */
 /* input of name fields*/
 /* .names input[type="text"] {
  font-family: Arial, sans-serif;
  font-size: medium;
  font-weight: 500;
  padding: 5px;
  border: 1px solid var(--border-color-2);
 } */
 /* fieldset wrapping the API key input: small uppercase caption text */
 fieldset.apiKey {
  width: 100%;
  padding-top: 16px;
  padding-bottom: 16px;
  font-size: x-small;
  font-weight: 600;
  text-transform: uppercase;
  color: var(--theme-nuance-color-3);
 }
 /* any element carrying the apiKey class */
 .apiKey {
  padding: 5px;
  font-family: Arial, sans-serif;
  font-weight: 500;
  border: 1px solid var(--border-color-2);
 }
 .apiKey:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* text field inside the apiKey element */
 .apiKey input[type="text"] {
  padding: 5px;
  font-family: Arial, sans-serif;
  font-size: medium;
  font-weight: 500;
  border: 1px solid var(--border-color-2);
 }
 /* label stays on the same line as the field */
 .apiKey label {
  display: inline-block;
  margin-right: 5px;
  width: auto;
 }
 /* textarea variant of the API key field: vertically resizable */
 textarea#api_key {
  padding-left: 10px;
  padding-top: 10px;
  font-size: medium;
  border: 1px solid var(--border-color-2);
  resize: vertical;
 }
 textarea#api_key:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* embedded title of the system prompt text area:
    a translucent, blurred strip pinned to the top-left corner */
 .input-label {
  position: absolute;
  left: 0;
  top: 0;
  width: 97%;
  margin-top: 1px;
  margin-left: 1px;
  margin-right: 20px;
  padding-top: 10px;
  padding-left: 13px;
  padding-right: 0;
  font-size: small;
  font-weight: 600;
  text-transform: uppercase;
  color: var(--theme-nuance-color-4);
  background: rgba(255, 255, 255, 0.5);
  border-radius: 8px 8px 0 0;
  -webkit-backdrop-filter: blur(10px); /* for safari */
  backdrop-filter: blur(10px);
  pointer-events: none;
  /* display: block;
  box-sizing: border-box; */
 }
 /* embedded title of the prompt style areas: plain text, no backdrop */
 .input-label-sec {
  position: absolute;
  left: 0;
  top: 0;
  margin-top: 16px;
  margin-left: 13px;
  font-size: x-small;
  font-weight: 600;
  text-transform: uppercase;
  color: var(--theme-nuance-color-4);
  pointer-events: none;
 }
 /* system prompt input area; the top padding leaves room for the embedded title */
 textarea.persistent-input {
  width: 97%;
  max-width: 97%;
  height: 50px;
  padding-left: 11px;
  padding-top: 42px;
  font-size: medium;
  overscroll-behavior: contain;
 }
 /* system prompt box */
 .persistent-input {
  width: 100%;
  max-width: 100%;
  height: auto;
  min-height: 50px;
  padding: 3px;
  transition: min-height 0.3s ease;
 }
 /* system prompt box while focused: minimum height grows to 150px */
 .persistent-input:focus {
  height: auto;
  min-height: 150px;
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 textarea.persistent-input:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* prompt style input area */
 textarea.persistent-input-sec {
  max-width: 97%;
  width: 97%;
  padding-left: 11px;
  padding-top: 42px;
  font-size: small;
  border: 1px solid var(--border-color-1);
  overscroll-behavior: contain;
 }
 textarea.persistent-input-sec:focus {
  box-shadow: 0 0 3px var(--border-focus-shadow);
  border: 1px solid var(--border-focus-color);
 }
 /* chat history box */
 .persistent-input-sec {
  min-height: 150px;
  height: auto;
 }
 /* images: half width, centered via auto side margins, rounded corners */
 img {
  display: block;
  width: 50%;
  margin-right: auto;
  margin-left: auto;
  border-radius: 8px;
 }
 /* code area background */
 pre code {
  display: block;
  padding: 0.2em;
  color: var(--code-text-color);
  background-color: var(--code-background-color);
  border-radius: 5px;
 }
 /* code area text */
 code {
  padding: 0.1em 0.3em;
  font-family: monospace;
  font-weight: bold;
  border-radius: 5px;
 }
 /* labels inside a fieldset each take their own line ... */
 fieldset label {
  display: block;
  margin: 0.5em 0;
 }
 /* ... unless marked "slim", which keeps them inline */
 fieldset label.slim {
  display: inline;
  margin: 0 0.5em;
 }
 /* page header: children spread across the row and vertically centered */
 header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding-left: 15px;
  text-align: center;
 }
 .generation-statistics:hover {
  cursor: default;
  color: var(--theme-nuance-color-4);
 }
 /* page footer: small, centered text */
 footer {
  font-size: 80%;
  text-align: center;
  color: var(--background-color-3);
  cursor: default;
 }
 footer a {
  font-weight: bold; /* bold print */
  text-decoration: none; /* no underlining */
  color: var(--background-color-4); /* color of the link */
 }
 footer a:hover {
  text-decoration: underline; /* underlining when hovering */
  color: var(--theme-nuance-color-4); /* color of the link when hovering */
 }
 /* prompt textarea height depends on the mode class of an ancestor */
 .mode-chat textarea[name=prompt] {
  border: 1px solid var(--primary-color-3);
  height: 8.5em;
 }
 .mode-completion textarea[name=prompt] {
  border: 1px solid var(--primary-color-3);
  height: 30em;
 }
@keyframes loading-bg-wipe {
  /* moves the background position from its start to its end */
  from {
    background-position: 0%;
  }
  to {
    background-position: 100%;
  }
 }
 /* loading indicator: a three-stop gradient animated by the
    keyframes above, repeating every 2 seconds */
 .loading {
  background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
  background-size: 50% 100%;
  animation: loading-bg-wipe 2s linear infinite;
 }
 /* button that opens the drop-down menu; only the top corners are rounded */
 .dropbtn {
  display: inline-block;
  top: 0;
  margin: 4px 2px;
  padding: 5px 20px;
  font-size: x-small;
  font-weight: 600;
  text-align: center;
  text-decoration: none;
  text-shadow: 0 0 2px #99999990;
  color: var(--button-primary-color);
  background-color: var(--background-color-1);
  border: 1px solid var(--background-color-1);
  border-radius: 4px 4px 0 0;
  cursor: pointer;
  transition: background-color 0.1s;
 }
 /* icon inside the button follows the button's text color */
 .dropbtn svg {
  margin-right: 0;
  vertical-align: middle;
  stroke: var(--button-primary-color);
 }
 .dropbtn:hover svg {
  margin-right: 0;
  vertical-align: middle;
  stroke: var(--button-primary-text);
 }
 .dropbtn:focus {
  outline: none; /* removes the blue border that appears when the button is focused */
 }
 /* wrapper: positioning context for the menu, hover target for opening it */
 .dropdown {
  display: inline-block;
  position: relative;
 }
 /* the menu panel: kept in the layout but invisible until the wrapper is hovered */
 .dropdown-content {
  /* display: none; */
  position: absolute;
  right: 0;
  z-index: 1;
  min-width: 160px;
  text-align: end;
  color: var(--button-secondary-color);
  background-color: var(--text-color-subtile-2);
  border-radius: 4px;
  box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
  /* hide the content immediately */
  opacity: 0;
  visibility: hidden;
  /* transition delay for the disappearance */
  transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out;
  transition-delay: 0.2s;
 }
 /* NOTE(review): this is an id selector, whereas every other rule targets the
    .dropdown-content class - confirm an element with this id exists. */
 #dropdown-content {transition-timing-function: ease;}
 .dropdown-content:hover {
  background-color: var(--text-color-subtile-2);
 }
 /* menu entries */
 .dropdown-content a {
  display: block;
  padding: 12px 16px;
  text-decoration: none;
  color: var(--border-color-2);
  background-color: var(--text-color-subtile-2);
  border-radius: 4px;
 }
 .dropdown-content a:hover {
  font-weight: 600;
  color: var(--border-color-2);
  background-color: var(--text-color-subtile-1);
 }
 /* hovering the wrapper reveals the menu */
 .dropdown:hover .dropdown-content {
  /* display: block; */
  border-radius: 4px;
  /* transition without delay for the appearance */
  opacity: 1;
  visibility: visible;
  transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s;
 }
 /* while the menu is open the button takes the primary button colors */
 .dropdown:hover .dropbtn {
  font-size: x-small;
  font-weight: 600;
  color: var(--button-primary-text);
  background-color: var(--button-primary-color);
  border: 1px solid var(--button-primary-border);
  stroke: var(--button-primary-text);
 }
 .dropdown:hover .dropbtn svg{
  stroke: var(--button-primary-text);
 }
 /* .dropdown:active .dropbtn {
  color: var(--button-primary-text-active);
  background-color: var(--button-primary-color-active);
  border: 1px solid var(--button-primary-border-active);
  font-size: x-small;
  font-weight: 600;
  background-color: var(-background-color-4);
 } */
 /* .omni {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.5em;
  border: 1px solid var(--border-color-3);
  border-radius: 5px;
  margin: 0.5em 0;
 } */
--- a/examples/server/public/system-prompts.js
+++ b/examples/server/public/system-prompts.js
@ -0,0 +1,68 @@
 /**
  * Catalogue of preset system prompts, exported as a named ES module export.
  *
  * Each property maps a preset key to an object with a single `systemPrompt`
  * string. Some prompts are in German, some contain "\n" escapes for
  * multi-line text, and the `empty` preset is the empty string.
  *
  * NOTE(review): the keys appear to be named after models or prompting
  * styles; confirm against the code that consumes this map.
  */
 export const systemPrompts = {
  default: {
    systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision."
  },
  empty: {
    systemPrompt: ""
  },
  airoboros: {
    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
  },
  alpaca: {
    systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  },
  atlas: {
    systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)."
  },
  atlas_de: {
    systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)."
  },
  commandrempty: {
    systemPrompt: "# Safety Preamble\n\n# System Preamble\n\n## Basic Rules\n\n# User Preamble\n\n## Task and Context\n\n## Style Guide\n\n## Available Tools\n"
  },
  commandrexample: {
    systemPrompt: "# Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n# User Preamble\n## Task and Context\n\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n\n## Available Tools\nCurrently no tools available."
  },
  cot: {
    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query."
  },
  deduce: {
    systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer."
  },
  deepseekcoder: {
    systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer."
  },
  jordan: {
    systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information."
  },
  leomistral: {
    systemPrompt: "Du bist ein hilfreicher Assistent."
  },
  med42: {
    systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE."
  },
  mistralopenorca: {
    systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!"
  },
  migeltot: {
    systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers."
  },
  orcamini: {
    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can."
  },
  samantha: {
    systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha."
  },
  sauerkraut: {
    systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten."
  },
  scarlett: {
    systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI."
  },
  synthia: {
    systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation."
  },
  vicuna: {
    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
  },
  };
--- a/examples/server/public/theme-beeninorder.css
+++ b/examples/server/public/theme-beeninorder.css
@ -0,0 +1,228 @@
 /* Author: Yazan Agha-Schrader */
 /* Inspiration was a batman wallpaper that i have on my phone */
 .theme-beeninorder {
 /* Custom properties of the "beeninorder" theme.
    Every base color is declared once as a full hsl() value and once split
    into -hue / -saturation / -lightness parts; the parts feed the calc()
    expressions that derive the hover and active button colors below.
    The semantic variables (background, border, text, button ...) are the
    ones the main stylesheet reads. */
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(201, 11%, 19%);
    --primary-color-1-hue: 201;
    --primary-color-1-saturation: 11%;
    --primary-color-1-lightness: 19%;
 --primary-color-2: hsl(201, 11%, 23%);
    --primary-color-2-hue: 201;
    --primary-color-2-saturation: 11%;
    --primary-color-2-lightness: 23%;
 --primary-color-3: hsl(201, 11%, 28%);
    --primary-color-3-hue: 201;
    --primary-color-3-saturation: 11%;
    --primary-color-3-lightness: 28%;
 --primary-color-4: hsl(201, 11%, 40%);
    --primary-color-4-hue: 201;
    --primary-color-4-saturation: 11%;
    --primary-color-4-lightness: 40%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(201, 11%, 80%);
 --secondary-color-1-hue: 201;
 --secondary-color-1-saturation: 11%;
 --secondary-color-1-lightness: 80%;
 --secondary-color-2: hsl(201, 11%, 74%);
 --secondary-color-2-hue: 201;
 --secondary-color-2-saturation: 11%;
 --secondary-color-2-lightness: 74%;
 --secondary-color-3: hsl(201, 11%, 67%);
 --secondary-color-3-hue: 201;
 --secondary-color-3-saturation: 11%;
 --secondary-color-3-lightness: 67%;
 --secondary-color-4: hsl(201, 11%, 60%);
 --secondary-color-4-hue: 201;
 --secondary-color-4-saturation: 11%;
 --secondary-color-4-lightness: 60%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(44.5, 96.7%,  52.9%);
    --theme-nuance-color-1-hue:             44.5;
    --theme-nuance-color-1-saturation:      96.7%;
    --theme-nuance-color-1-lightness:       52.9%;
 --theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
    --theme-nuance-color-2-hue:             44.5;
    --theme-nuance-color-2-saturation:      96.7%;
    --theme-nuance-color-2-lightness:       52.9%;
 --theme-nuance-color-3: hsl(44.5, 96.7%,  52.9%);
    --theme-nuance-color-3-hue:             44.5;
    --theme-nuance-color-3-saturation:      96.7%;
    --theme-nuance-color-3-lightness:       52.9%;
 --theme-nuance-color-4: hsl(44.5, 96.7%,  52.9%);
    --theme-nuance-color-4-hue:             44.5;
    --theme-nuance-color-4-saturation:      96.7%;
    --theme-nuance-color-4-lightness:       52.9%;
 /* ----------- ROYGP COLORS ------------------ */
    --theme-red-color:     hsl(232, 40%, 45%);
    --theme-orange-color:  #e76f51;
    --theme-yellow-color:  #ffd95f;
    --theme-green-color:   #A3BE8C;
    --theme-purple-color:  hsl(232, 30%, 40%);
 /* ------------------------------------------- */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--theme-nuance-color-3);
 /* the thumb border reuses the thumb color declared on the line above */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:         var(--theme-nuance-color-4);
 /* ------------------------------------------- */
 --button-alert-text-hover:       var(--secondary-color-1);
 --button-alert-color-hover:      var(--theme-purple-color);
 --button-alert-border-hover:     var(--theme-purple-color);
 --button-alert-text-active:      var(--secondary-color-1);
 --button-alert-color-active:     var(--theme-red-color);
 --button-alert-border-active:    var(--theme-red-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text:   var(--primary-color-1);
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(201,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
 --button-primary-color-hover:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  + 100%));
 --button-primary-color-active:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
 --button-primary-border-active:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:   var(--secondary-color-1);
 --button-secondary-color:  var(--primary-color-3);
 --button-secondary-border: var(--primary-color-3);
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(44.5,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:  var(--primary-color-4);
 --button-secondary-border-hover: var(--primary-color-4);
 /* ---------active--------- */
 --button-secondary-text-active: var(--secondary-color-1);
 --button-secondary-color-active:
    hsl(201,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 --button-secondary-border-active:
    hsl(201,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 /* no separate hover values are defined for the tertiary (disabled) state */
 --button-tertiary-text:   var(--primary-color-4);
 --button-tertiary-color:  var(--primary-color-2);
 --button-tertiary-border: var(--primary-color-2);
 }
--- a/examples/server/public/theme-ketivah.css
+++ b/examples/server/public/theme-ketivah.css
@ -0,0 +1,201 @@
 /* Author: Yazan Agha-Schrader */
 .theme-ketivah {
    /*
     * Custom-property palette for the "ketivah" theme.
     * Each base colour is declared as a full hsl() value plus separate
     * -hue / -saturation / -lightness components, so that the button
     * shades further down can be derived from them with calc().
     */
    /* ---------- PRIMARY COLORS ----------------- */
    --primary-color-1: hsl(0, 0%,    99.2%);
    --primary-color-1-hue:         0;
    --primary-color-1-saturation:  0%;
    --primary-color-1-lightness:   99.2%;
    --primary-color-2: hsl(0, 0%,    95%);
    --primary-color-2-hue:         0;
    --primary-color-2-saturation:  0%;
    --primary-color-2-lightness:   95%;
    --primary-color-3: hsl(0, 0%,    88%);
    --primary-color-3-hue:         0;
    --primary-color-3-saturation:  0%;
    --primary-color-3-lightness:   88%;
    --primary-color-4: hsl(0, 0%,    80%);
    --primary-color-4-hue:         0;
    --primary-color-4-saturation:  0%;
    --primary-color-4-lightness:   80%;
    /* ---------- SECONDARY COLORS --------------- */
    --secondary-color-1: hsl(0, 0%,    20%);
    --secondary-color-1-hue:         0;
    --secondary-color-1-saturation:  0%;
    --secondary-color-1-lightness:   20%;
    --secondary-color-2: hsl(0, 0%,    23.1%);
    --secondary-color-2-hue:         0;
    --secondary-color-2-saturation:  0%;
    --secondary-color-2-lightness:   23.1%;
    --secondary-color-3: hsl(0, 0%,    29%);
    --secondary-color-3-hue:         0;
    --secondary-color-3-saturation:  0%;
    --secondary-color-3-lightness:   29%;
    --secondary-color-4: hsl(0, 0.0%,  36.1%);
    --secondary-color-4-hue:              0.0;
    --secondary-color-4-saturation:       0.0%;
    --secondary-color-4-lightness:       36.1%;
    /* ----------- NUANCES COLORS ---------------- */
    /* NOTE(review): the hsl() values below use 0% saturation while the
       matching -saturation components carry 81-82%. Every calc() in this
       rule that reads --theme-nuance-color-3-saturation subtracts 100%,
       so the derived shades end up unsaturated as well. Confirm the
       mismatch is intentional before "fixing" either side. */
    --theme-nuance-color-1: hsl(165.2, 0%, 35.1%);
    --theme-nuance-color-1-hue:             165.2;
    --theme-nuance-color-1-saturation:       82.1%;
    --theme-nuance-color-1-lightness:        35.1%;
    --theme-nuance-color-2: hsl(165.2, 0%, 35.1%);
    --theme-nuance-color-2-hue:             165.2;
    --theme-nuance-color-2-saturation:       82.1%;
    --theme-nuance-color-2-lightness:        35.1%;
    --theme-nuance-color-3: hsl(165.2, 0%, 35.3%);
    --theme-nuance-color-3-hue:             165.2;
    --theme-nuance-color-3-saturation:       81.1%;
    --theme-nuance-color-3-lightness:        35.3%;
    --theme-nuance-color-4: hsl(164.9, 0%, 27.6%);
    --theme-nuance-color-4-hue:             164.9;
    --theme-nuance-color-4-saturation:       81.6%;
    --theme-nuance-color-4-lightness:        27.6%;
    /* ----------- ROYGP COLORS ------------------ */
    --theme-red-color:     hsl(0.3, 80.0%, 50.0%);
    --theme-orange-color:  #e76f51;
    --theme-yellow-color:  hsl(60,  70.6%, 73.3%);
    --theme-green-color:   #A3BE8C;
    --theme-purple-color:  hsl(0.3, 70.0%, 45.0%);
    /* ------- semantic aliases onto the palette ------- */
    --background-color-1:    var(--primary-color-1);
    --background-color-2:    var(--primary-color-2);
    --background-color-3:    var(--primary-color-3);
    --background-color-4:    var(--primary-color-4);
    --border-color-1:        var(--primary-color-2);
    --border-color-2:        var(--primary-color-3);
    --border-color-3:        var(--primary-color-4);
    --border-focus-color:    var(--theme-nuance-color-2);
    --border-focus-shadow:   var(--theme-nuance-color-1);
    --text-color-plain:      var(--secondary-color-1);
    --text-color-subtile-1:  var(--secondary-color-2);
    --text-color-subtile-2:  var(--secondary-color-3);
    --code-background-color: var(--secondary-color-2);
    --code-text-color:       var(--primary-color-2);
    --ui-range-thumb-color:  var(--primary-color-4);
    /* the thumb border follows the thumb fill colour declared just above */
    --ui-range-thumb-border: var(--ui-range-thumb-color);
    --textarea-border-color: var(--secondary-color-4);
    --chat-id-color:         var(--theme-nuance-color-4);
    /* ------------- ALERT BUTTONS --------------- */
    --button-alert-text-hover:       var(--primary-color-1);
    --button-alert-color-hover:      var(--theme-purple-color);
    --button-alert-border-hover:     var(--theme-purple-color);
    --button-alert-text-active:      var(--primary-color-1);
    --button-alert-color-active:     var(--theme-red-color);
    --button-alert-border-active:    var(--theme-red-color);
    /* ----------- PRIMARY BUTTONS --------------- */
    /* - button should immediately catch the eye - */
    --button-primary-text:
    hsl(0,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
    --button-primary-color:  var(--theme-nuance-color-3);
    --button-primary-border: var(--theme-nuance-color-3);
    /* ---------hover---------- */
    --button-primary-text-hover:
    hsl(0,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
    --button-primary-color-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
    --button-primary-border-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
    /* ---------active--------- */
    --button-primary-text-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  + 100%));
    --button-primary-color-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
    --button-primary-border-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
    /* ---------- SECONDARY BUTTONS -------------- */
    /* these should NOT immediately catch the eye  */
    --button-secondary-text:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 50%));
    --button-secondary-color:  var(--primary-color-3);
    --button-secondary-border: var(--primary-color-3);
    /* ---------hover---------- */
    --button-secondary-text-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
    --button-secondary-color-hover:  var(--primary-color-4);
    --button-secondary-border-hover: var(--primary-color-4);
    /* ---------active--------- */
    --button-secondary-text-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
    --button-secondary-color-active:
    hsl(0,
    calc(var(--primary-color-4-saturation) - 100%),
    calc(var(--primary-color-4-lightness)  - 15%));
    --button-secondary-border-active:
    hsl(0,
    calc(var(--primary-color-4-saturation) - 100%),
    calc(var(--primary-color-4-lightness)  - 15%));
    /* ---------- TERTIARY BUTTONS --------------- */
    /* ---------- disabled buttons --------------- */
    --button-tertiary-text:   var(--primary-color-4);
    --button-tertiary-color:  var(--primary-color-2);
    --button-tertiary-border: var(--primary-color-2);
    /* ---------hover---------- */
    /* same values as the resting state; the -hover suffix matches the
       naming of the primary/secondary hover variables above */
    --button-tertiary-text-hover:   var(--primary-color-4);
    --button-tertiary-color-hover:  var(--primary-color-2);
    --button-tertiary-border-hover: var(--primary-color-2);
    /* ------------ LOADING COLORS --------------- */
    /* same RGB value, alpha 00 (transparent) and ff (opaque) */
    --loading-color-1: #eeeeee00;
    --loading-color-2: #eeeeeeff;
    }
--- a/examples/server/public/theme-mangotango.css
+++ b/examples/server/public/theme-mangotango.css
@ -0,0 +1,216 @@
 /* Author: Yazan Agha-Schrader */
 /* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
 .theme-mangotango {
 /*
  * Custom-property palette for the "mangotango" theme.
  * Each base colour is declared once as a full hsl() value together with
  * its -saturation / -lightness components. This theme declares no -hue
  * components; hues are written as literals (192, 23.1) in the derived
  * hsl() expressions further down.
  */
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(192, 8.5%, 11.6%);
    --primary-color-1-saturation: 8.5%;
    --primary-color-1-lightness: 11.6%;
 --primary-color-2: hsl(192, 8.5%, 21%);
    --primary-color-2-saturation: 8.5%;
    --primary-color-2-lightness: 21%;
 --primary-color-3: hsl(192, 8.5%, 30%);
    --primary-color-3-saturation: 8.5%;
    --primary-color-3-lightness: 30%;
 --primary-color-4: hsl(192, 8.5%, 40%);
    --primary-color-4-saturation: 8.5%;
    --primary-color-4-lightness: 40%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(192, 8.5%, 80%);
    --secondary-color-1-saturation: 8.5%;
    --secondary-color-1-lightness: 80%;
 --secondary-color-2: hsl(192, 8.5%, 73%);
    --secondary-color-2-saturation: 8.5%;
    --secondary-color-2-lightness: 73%;
 --secondary-color-3: hsl(192, 8.5%, 66%);
    --secondary-color-3-saturation: 8.5%;
    --secondary-color-3-lightness: 66%;
 --secondary-color-4: hsl(192, 8.5%, 60%);
    --secondary-color-4-saturation: 8.5%;
    --secondary-color-4-lightness: 60%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(23.1, 100%, 60.2%);
    --theme-nuance-color-1-saturation: 100%;
    --theme-nuance-color-1-lightness: 60.2%;
 --theme-nuance-color-2: hsl(23.1, 100%, 60.2%);
    --theme-nuance-color-2-saturation: 100%;
    --theme-nuance-color-2-lightness: 60.2%;
 --theme-nuance-color-3: hsl(23.1, 100%, 60.2%);
    --theme-nuance-color-3-saturation: 100%;
    --theme-nuance-color-3-lightness: 60.2%;
 --theme-nuance-color-4: hsl(23.1, 100%, 60.2%);
    --theme-nuance-color-4-saturation: 100%;
    --theme-nuance-color-4-lightness: 60.2%;
 /* ----------- ROYGP COLORS ------------------ */
    --theme-red-color:     hsl(325, 60%, 50%);
    --theme-orange-color:  #e76f51;
    --theme-yellow-color:  #ffd95f;
    --theme-green-color:   #A3BE8C;
    --theme-blue-color:    hsl(192, 95%, 40%);
    --theme-purple-color:  hsl(192, 80%, 35%);
 /* ------- semantic aliases onto the palette ------- */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--theme-nuance-color-3);
 /* the thumb border follows the thumb fill colour declared just above */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:         var(--theme-nuance-color-4);
 /* ------------- ALERT BUTTONS --------------- */
 --button-alert-text-hover:       var(--secondary-color-1);
 --button-alert-color-hover:      var(--theme-purple-color);
 --button-alert-border-hover:     var(--theme-purple-color);
 --button-alert-text-active:      var(--secondary-color-1);
 --button-alert-color-active:     var(--theme-blue-color);
 --button-alert-border-active:    var(--theme-blue-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text: var(--primary-color-1);
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(192,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
 --button-primary-color-hover:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  + 100%));
 --button-primary-color-active:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
 --button-primary-border-active:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:   var(--secondary-color-1);
 --button-secondary-color:  var(--primary-color-3);
 --button-secondary-border: var(--primary-color-3);
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(23.1,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:  var(--primary-color-4);
 --button-secondary-border-hover: var(--primary-color-4);
 /* ---------active--------- */
 --button-secondary-text-active: var(--secondary-color-1);
 --button-secondary-color-active:
    hsl(192,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 --button-secondary-border-active:
    hsl(192,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 --button-tertiary-text:   var(--primary-color-4);
 --button-tertiary-color:  var(--primary-color-2);
 --button-tertiary-border: var(--primary-color-2);
 /* ---------hover---------- */
 /* same values as the resting state; the -hover suffix matches the
    naming of the primary/secondary hover variables above */
 --button-tertiary-text-hover:   var(--primary-color-4);
 --button-tertiary-color-hover:  var(--primary-color-2);
 --button-tertiary-border-hover: var(--primary-color-2);
 }
--- a/examples/server/public/theme-playground.css
+++ b/examples/server/public/theme-playground.css
@ -0,0 +1,221 @@
 /* Author: Yazan Agha-Schrader */
 /* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */
 .theme-playground {
 /*
  * Custom-property palette for the "playground" theme.
  * Each base colour is declared as a full hsl() value plus separate
  * -hue / -saturation / -lightness components, so that the button
  * shades further down can be derived from them with calc().
  */
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(0, 0%,    99.2%);
    --primary-color-1-hue:         0;
    --primary-color-1-saturation:  0%;
    --primary-color-1-lightness:   99.2%;
 --primary-color-2: hsl(0, 0%,    95%);
    --primary-color-2-hue:         0;
    --primary-color-2-saturation:  0%;
    --primary-color-2-lightness:   95%;
 --primary-color-3: hsl(0, 0%,    88%);
    --primary-color-3-hue:         0;
    --primary-color-3-saturation:  0%;
    --primary-color-3-lightness:   88%;
 --primary-color-4: hsl(0, 0%,    80%);
    --primary-color-4-hue:         0;
    --primary-color-4-saturation:  0%;
    --primary-color-4-lightness:   80%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(0, 0%,    20%);
    --secondary-color-1-hue:         0;
    --secondary-color-1-saturation:  0%;
    --secondary-color-1-lightness:   20%;
 --secondary-color-2: hsl(0, 0%,    23.1%);
    --secondary-color-2-hue:         0;
    --secondary-color-2-saturation:  0%;
    --secondary-color-2-lightness:   23.1%;
 --secondary-color-3: hsl(0, 0%,    29%);
    --secondary-color-3-hue:         0;
    --secondary-color-3-saturation:  0%;
    --secondary-color-3-lightness:   29%;
 --secondary-color-4: hsl(0, 0%,    36.1%);
    --secondary-color-4-hue:         0;
    --secondary-color-4-saturation:  0%;
    --secondary-color-4-lightness:   36.1%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(165.2, 82.1%, 35.1%);
    --theme-nuance-color-1-hue:             165.2;
    --theme-nuance-color-1-saturation:      82.1%;
    --theme-nuance-color-1-lightness:       35.1%;
 --theme-nuance-color-2: hsl(165.2, 82.1%, 35.1%);
    --theme-nuance-color-2-hue:             165.2;
    --theme-nuance-color-2-saturation:      82.1%;
    --theme-nuance-color-2-lightness:       35.1%;
 --theme-nuance-color-3: hsl(165.2, 81.1%, 35.3%);
    --theme-nuance-color-3-hue:             165.2;
    --theme-nuance-color-3-saturation:      81.1%;
    --theme-nuance-color-3-lightness:       35.3%;
 --theme-nuance-color-4: hsl(164.9, 81.6%, 27.6%);
    --theme-nuance-color-4-hue:             164.9;
    --theme-nuance-color-4-saturation:      81.6%;
    --theme-nuance-color-4-lightness:       27.6%;
 /* ----------- ROYGP COLORS ------------------ */
 --theme-red-color:     hsl(0.3, 80%, 50%);
 --theme-orange-color:  #e76f51;
 --theme-yellow-color:  hsl(60, 70.6%, 73.3%);
 --theme-green-color:   #A3BE8C;
 --theme-purple-color:  hsl(0.3, 70%, 45%);
 /* ------- semantic aliases onto the palette ------- */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--primary-color-4);
 /* the thumb border follows the thumb fill colour declared just above */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:        var(--theme-nuance-color-4);
 /* ------------- ALERT BUTTONS --------------- */
 --button-alert-text-hover:       var(--primary-color-1);
 --button-alert-color-hover:      var(--theme-purple-color);
 --button-alert-border-hover:     var(--theme-purple-color);
 --button-alert-text-active:      var(--primary-color-1);
 --button-alert-color-active:     var(--theme-red-color);
 --button-alert-border-active:    var(--theme-red-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text:
    hsl(0,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(0,
    calc(var(--primary-color-1-saturation) - 100%),
    calc(var(--primary-color-1-lightness)  + 100%));
 --button-primary-color-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 100%),
    calc(var(--theme-nuance-color-3-lightness)  + 100%));
 --button-primary-color-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
 --button-primary-border-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 2%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 50%));
 --button-secondary-color:  var(--primary-color-3);
 --button-secondary-border: var(--primary-color-3);
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:  var(--primary-color-4);
 --button-secondary-border-hover: var(--primary-color-4);
 /* ---------active--------- */
 --button-secondary-text-active:
    hsl(165.2,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-active:
    hsl(0,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 --button-secondary-border-active:
    hsl(0,
    calc(var(--primary-color-4-saturation) - 30%),
    calc(var(--primary-color-4-lightness)  - 15%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 --button-tertiary-text:   var(--primary-color-4);
 --button-tertiary-color:  var(--primary-color-2);
 --button-tertiary-border: var(--primary-color-2);
 /* ---------hover---------- */
 /* same values as the resting state; the -hover suffix matches the
    naming of the primary/secondary hover variables above */
 --button-tertiary-text-hover:   var(--primary-color-4);
 --button-tertiary-color-hover:  var(--primary-color-2);
 --button-tertiary-border-hover: var(--primary-color-2);
 }
--- a/examples/server/public/theme-polarnight.css
+++ b/examples/server/public/theme-polarnight.css
@ -0,0 +1,253 @@
 /* Author: Yazan Agha-Schrader */
 /* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
 .theme-polarnight {
 /*
  * Custom-property palette for the "polarnight" theme.
  * Each base colour is declared as a full hsl() value plus separate
  * -hue / -saturation / -lightness components, so that the button
  * shades further down can be derived from them with calc().
  */
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(220.0, 16.4%, 21.6%);
    --primary-color-1-hue:             220.0;
    --primary-color-1-saturation:      16.4%;
    --primary-color-1-lightness:       21.6%;
 --primary-color-2: hsl(221.7, 16.3%, 27.6%);
    /* custom properties need the double-dash prefix */
    --primary-color-2-hue:             221.7;
    --primary-color-2-saturation:      16.3%;
    --primary-color-2-lightness:       27.6%;
 --primary-color-3: hsl(220.0, 16.8%, 31.6%);
    --primary-color-3-hue:             220.0;
    --primary-color-3-saturation:      16.8%;
    --primary-color-3-lightness:       31.6%;
 --primary-color-4: hsl(220.0, 16.5%, 35.7%);
    --primary-color-4-hue:             220.0;
    --primary-color-4-saturation:      16.5%;
    --primary-color-4-lightness:       35.7%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(217.5, 26.7%, 94.1%);
    --secondary-color-1-hue:             217.5;
    --secondary-color-1-saturation:      26.7%;
    --secondary-color-1-lightness:       94.1%;
 --secondary-color-2: hsl(218.2, 26.8%, 92.0%);
    --secondary-color-2-hue:             218.2;
    --secondary-color-2-saturation:      26.8%;
    --secondary-color-2-lightness:       92.0%;
 --secondary-color-3: hsl(218.8, 27.9%, 88.0%);
    --secondary-color-3-hue:             218.8;
    --secondary-color-3-saturation:      27.9%;
    --secondary-color-3-lightness:       88.0%;
 --secondary-color-4: hsl(218.8, 18.3%, 81.8%);
    --secondary-color-4-hue:             218.8;
    --secondary-color-4-saturation:      18.3%;
    --secondary-color-4-lightness:       81.8%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
    --theme-nuance-color-1-hue:             178.7;
    --theme-nuance-color-1-saturation:      25.1%;
    --theme-nuance-color-1-lightness:       64.9%;
 --theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
    --theme-nuance-color-2-hue:             193.3;
    --theme-nuance-color-2-saturation:      43.4%;
    --theme-nuance-color-2-lightness:       67.5%;
 --theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
    --theme-nuance-color-3-hue:             210.0;
    --theme-nuance-color-3-saturation:      34.0%;
    --theme-nuance-color-3-lightness:       63.1%;
 --theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
    --theme-nuance-color-4-hue:             213.1;
    --theme-nuance-color-4-saturation:      32.0%;
    --theme-nuance-color-4-lightness:       52.2%;
 /* ----------- ROYGP COLORS ------------------ */
 --theme-red-color:    hsl(354.3, 42.3%, 56.5%);
 --theme-orange-color: hsl(20, 85%, 50%);
 --theme-yellow-color: hsl(20, 75%, 45%);
 --theme-green-color:  hsl( 92.4, 27.8%, 64.7%);
 --theme-purple-color: hsl(311.1, 20.2%, 63.1%);
 /* ------- semantic aliases onto the palette ------- */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--theme-nuance-color-3);
 /* the thumb border follows the thumb fill colour declared just above */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:        var(--theme-nuance-color-4);
 /* ------------- ALERT BUTTONS --------------- */
 --button-alert-text-hover:       var(--secondary-color-1);
 --button-alert-color-hover:      var(--theme-yellow-color);
 --button-alert-border-hover:     var(--theme-yellow-color);
 --button-alert-text-active:      var(--secondary-color-1);
 --button-alert-color-active:     var(--theme-orange-color);
 --button-alert-border-active:    var(--theme-orange-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text:   var(--secondary-color-1);
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(217.5,
    calc(var(--secondary-color-1-saturation) - 35%),
    calc(var(--secondary-color-1-lightness)  + 30%));
 --button-primary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 35%));
 --button-primary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 --button-primary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 50%));
 --button-secondary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 --button-secondary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 --button-secondary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 /* ---------active--------- */
 --button-secondary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 25%));
 --button-secondary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
 --button-secondary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  - 15%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 --button-tertiary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 /* ---------hover---------- */
 /* identical to the resting tertiary values above */
 --button-tertiary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 }
--- a/examples/server/public/theme-snowstorm.css
+++ b/examples/server/public/theme-snowstorm.css
@ -0,0 +1,251 @@
 /* Author: Yazan Agha-Schrader */
 /* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
 .theme-snowstorm {
 /* ---------- PRIMARY COLORS ----------------- */
 --primary-color-1: hsl(217.5, 26.7%, 94.1%);
    --primary-color-1-hue:             217.5;
    --primary-color-1-saturation:      26.7%;
    --primary-color-1-lightness:       94.1%;
 --primary-color-2: hsl(218.2, 26.8%, 92.0%);
    --primary-color-2-hue:             218.2;
    --primary-color-2-saturation:      26.8%;
    --primary-color-2-lightness:       92.0%;
 --primary-color-3: hsl(218.8, 27.9%, 88.0%);
    --primary-color-3-hue:             218.8;
    --primary-color-3-saturation:      27.9%;
    --primary-color-3-lightness:       88.0%;
 --primary-color-4: hsl(218.8, 18.3%, 81.8%);
    --primary-color-4-hue:             218.8;
    --primary-color-4-saturation:      18.3%;
    --primary-color-4-lightness:       81.8%;
 /* ---------- SECONDARY COLORS --------------- */
 --secondary-color-1: hsl(220.0, 16.4%, 21.6%);
    --secondary-color-1-hue:             220.0;
    --secondary-color-1-saturation:      16.4%;
    --secondary-color-1-lightness:       21.6%;
 --secondary-color-2: hsl(221.7, 16.3%, 27.6%);
    --secondary-color-2-hue:             221.7;
    --secondary-color-2-saturation:      16.3%;
    --secondary-color-2-lightness:       27.6%;
 --secondary-color-3: hsl(220.0, 16.8%, 31.6%);
    --secondary-color-3-hue:             220.0;
    --secondary-color-3-saturation:      16.8%;
    --secondary-color-3-lightness:       31.6%;
 --secondary-color-4: hsl(220.0, 16.5%, 35.7%);
    --secondary-color-4-hue:             220.0;
    --secondary-color-4-saturation:      16.5%;
    --secondary-color-4-lightness:       35.7%;
 /* ----------- NUANCES COLORS ---------------- */
 --theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
    --theme-nuance-color-1-hue:             178.7;
    --theme-nuance-color-1-saturation:      25.1%;
    --theme-nuance-color-1-lightness:       64.9%;
 --theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
    --theme-nuance-color-2-hue:             193.3;
    --theme-nuance-color-2-saturation:      43.4%;
    --theme-nuance-color-2-lightness:       67.5%;
 --theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
    --theme-nuance-color-3-hue:             210.0;
    --theme-nuance-color-3-saturation:      34.0%;
    --theme-nuance-color-3-lightness:       63.1%;
 --theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
    --theme-nuance-color-4-hue:             213.1;
    --theme-nuance-color-4-saturation:      32.0%;
    --theme-nuance-color-4-lightness:       52.2%;
 /* ----------- ROYGP COLORS ------------------ */
 --theme-red-color:    hsl(32.5, 80%, 50%);
 --theme-orange-color: hsl(32.5, 70%, 45%);
 --theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
 --theme-green-color:  hsl(92.4,  27.8%, 64.7%);
 --theme-purple-color: hsl(311.1, 20.2%, 63.1%);
 /* ------------------------------------------- */
 /* Semantic aliases: map the palette variables declared in this rule onto UI roles. */
 --background-color-1:    var(--primary-color-1);
 --background-color-2:    var(--primary-color-2);
 --background-color-3:    var(--primary-color-3);
 --background-color-4:    var(--primary-color-4);
 --border-color-1:        var(--primary-color-2);
 --border-color-2:        var(--primary-color-3);
 --border-color-3:        var(--primary-color-4);
 --border-focus-color:    var(--theme-nuance-color-2);
 --border-focus-shadow:   var(--theme-nuance-color-1);
 --text-color-plain:      var(--secondary-color-1);
 --text-color-subtile-1:  var(--secondary-color-2);
 --text-color-subtile-2:  var(--secondary-color-3);
 --code-background-color: var(--secondary-color-2);
 --code-text-color:       var(--primary-color-2);
 --ui-range-thumb-color:  var(--theme-nuance-color-3);
 /* The thumb border follows the thumb color declared on the line above. */
 --ui-range-thumb-border: var(--ui-range-thumb-color);
 --textarea-border-color: var(--secondary-color-4);
 --chat-id-color:         var(--theme-nuance-color-4);
 /* ------------------------------------------- */
 --button-alert-text-hover:       var(--primary-color-1);
 --button-alert-color-hover:      var(--theme-orange-color);
 --button-alert-border-hover:     var(--theme-orange-color);
 --button-alert-text-active:      var(--primary-color-1);
 --button-alert-color-active:     var(--theme-red-color);
 --button-alert-border-active:    var(--theme-red-color);
 /* ----------- PRIMARY BUTTONS --------------- */
 /* - button should immediately catch the eye - */
 --button-primary-text:   var(--secondary-color-1);
 --button-primary-color:  var(--theme-nuance-color-3);
 --button-primary-border: var(--theme-nuance-color-3);
 /* ---------hover---------- */
 --button-primary-text-hover:
    hsl(217.5,
    calc(var(--secondary-color-1-saturation) + 35%),
    calc(var(--secondary-color-1-lightness)  - 30%));
 --button-primary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 --button-primary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) -  2%),
    calc(var(--theme-nuance-color-3-lightness)  - 10%));
 /* ---------active--------- */
 --button-primary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 35%));
 --button-primary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 --button-primary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 10%),
    calc(var(--theme-nuance-color-3-lightness)  - 25%));
 /* ---------- SECONDARY BUTTONS -------------- */
 /* these should NOT immediately catch the eye  */
 --button-secondary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 50%));
 --button-secondary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 --button-secondary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  + 10%));
 /* ---------hover---------- */
 --button-secondary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 20%),
    calc(var(--theme-nuance-color-3-lightness)  - 80%));
 --button-secondary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 --button-secondary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 22%),
    calc(var(--theme-nuance-color-3-lightness)  +  1%));
 /* ---------active--------- */
 --button-secondary-text-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) + 40%),
    calc(var(--theme-nuance-color-3-lightness)  - 55%));
 --button-secondary-color-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-secondary-border-active:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 30%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 /* ---------- TERTIARY BUTTONS --------------- */
 /* ---------- disabled buttons --------------- */
 --button-tertiary-text:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 /* ---------hover---------- */
 --button-tertiary-text-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  -  5%));
 --button-tertiary-color-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 --button-tertiary-border-hover:
    hsl(210,
    calc(var(--theme-nuance-color-3-saturation) - 40%),
    calc(var(--theme-nuance-color-3-lightness)  + 20%));
 }
--- a/examples/server/public_simplechat/datautils.mjs
+++ b/examples/server/public_simplechat/datautils.mjs
@ -0,0 +1,266 @@
 //@ts-check
 // Helpers to work with different data types
 // by Humans for All
 //
 /**
 * Given the limited context size of local LLMs, many a times when context gets filled
 * between the prompt and the response, it can lead to repeating text garbage generation.
 * And many a times setting penalty wrt repeatation leads to over-intelligent garbage
 * repeatation with slight variations. These garbage inturn can lead to overloading of the
 * available model context, leading to less valuable response for subsequent prompts/queries,
 * if chat history is sent to ai model.
 *
 * So two simple minded garbage trimming logics are experimented below.
 * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
 * * another based on char-histogram-driven garbage trimming.
 *   * in future characteristic of histogram over varying lengths could be used to allow for
 *     a more aggressive and adaptive trimming logic.
 */
 /**
 * Simple minded logic to help remove repeating garbage at the end of the string.
 * The repetition needs to match perfectly.
 *
 * For every substring length subL in [1, maxSubL) the last subL chars of sIn are taken
 * as the reference, and the string is walked backwards in subL sized steps counting how
 * many consecutive chunks equal that reference. The substring length which gives the
 * longest matched run (count * subL) wins, and that many chars are removed from the end.
 *
 * A run is considered only if it is longer than maxSubL chars, and trimming happens only
 * if the winning run is at least maxMatchLenThreshold chars long.
 *
 * @param {string} sIn - the text to check
 * @param {number} maxSubL - exclusive upper bound on the repeating substring length probed
 * @param {number} maxMatchLenThreshold - minimum length of the repeated run needed to trim
 * @returns {{trimmed: boolean, data: string}} data is sIn itself when nothing got trimmed
 */
 export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
    // rCnt[subL] = number of consecutive chunks of length subL, at the end, matching the reference
    let rCnt = [0];
    let maxMatchLen = maxSubL;
    let iMML = -1;
    for(let subL=1; subL < maxSubL; subL++) {
        rCnt.push(0);
        let refS = sIn.substring(sIn.length-subL, sIn.length);
        for(let i=sIn.length; i > 0; i -= subL) {
            // substring clamps a negative start to 0, so a short chunk at the very
            // beginning of the string compares unequal and ends the run.
            if (refS != sIn.substring(i-subL, i)) {
                break;
            }
            rCnt[subL] += 1;
        }
        // Record the run irrespective of how the scan ended, so that a run which
        // stretches right up to the beginning of the string is also accounted for.
        let curMatchLen = rCnt[subL]*subL;
        if (maxMatchLen < curMatchLen) {
            maxMatchLen = curMatchLen;
            iMML = subL;
        }
    }
    console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
    if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) {
        return {trimmed: false, data: sIn};
    }
    console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
    let iEnd = sIn.length - maxMatchLen;
    return { trimmed: true, data: sIn.substring(0, iEnd) };
 }
 /**
 * Keep applying trim_repeat_garbage_at_end for as long as it manages to trim something.
 * When an attempt fails, drop one char from the end and retry, so that a different
 * garbage pattern hiding behind a few stray chars can still be reached.
 *
 * The text as it stood at the first failed attempt of a streak is remembered; once
 * skipMax attempts in a row have failed, that remembered text is returned, ie the
 * chars dropped during the final unsuccessful streak are not lost.
 *
 * @param {string} sIn
 * @param {number} maxSubL
 * @param {number | undefined} [maxMatchLenThreshold]
 * @param {number} [skipMax] - consecutive failed attempts allowed before giving up
 */
 export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
    let text = sIn;
    let fallback = "";
    let failedInARow = 0;
    for(;;) {
        const res = trim_repeat_garbage_at_end(text, maxSubL, maxMatchLenThreshold);
        if (res.trimmed == true) {
            // Progress made, so start a fresh streak from the trimmed text.
            failedInARow = 0;
            text = res.data;
            continue;
        }
        if (failedInARow == 0) {
            fallback = res.data;
        }
        failedInARow += 1;
        if (failedInARow >= skipMax) {
            return fallback;
        }
        // Skip the last char and probe again.
        text = res.data.substring(0,res.data.length-1);
    }
 }
 /**
 * A simple minded try at trimming garbage at the end, using histogram driven characteristics.
 * There can be variation in the repetitions, as long as no new char props up.
 *
 * Learn: the chars seen in the last maxMatchLenThreshold chars of sIn are collected into a
 * histogram, stopping early once maxUniq distinct chars have been met (the char which hits
 * that limit is counted towards its type but not added to the histogram).
 * If the number of distinct alphabets or numerals or other chars seen exceeds maxType,
 * nothing is trimmed.
 *
 * Catch and Trim: the string is walked backwards from the end till the first char that is
 * not in the histogram. If that char is met within the last maxMatchLenThreshold chars,
 * nothing is trimmed; else all the chars after it are removed. If every char of sIn is in
 * the histogram, the whole string is trimmed away.
 *
 * ALERT: This is not perfect and only provides a rough garbage identification logic.
 * Also it currently only differentiates between character classes wrt english.
 *
 * @param {string} sIn
 * @param {number} maxType - max distinct chars allowed per type (alphabet / numeral / other)
 * @param {number} maxUniq - learning stops once this many distinct chars are seen
 * @param {number} maxMatchLenThreshold - length of the tail learnt from, as well as the
 *   minimum garbage length needed for trimming
 * @returns {{trimmed: boolean, data: string}} data is sIn itself when nothing got trimmed
 */
 export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
    // Without a positive threshold nothing is learnt, and reporting a trim that removes
    // nothing would make callers which loop till no-more-trimming spin forever.
    if (!(maxMatchLenThreshold > 0) || (sIn.length < maxMatchLenThreshold)) {
        return { trimmed: false, data: sIn };
    }
    let iAlp = 0;
    let iNum = 0;
    let iOth = 0;
    // Learn
    /** @type {Object<string, number>} */
    let hist = {};
    let iUniq = 0;
    for(let i=0; i<maxMatchLenThreshold; i++) {
        let c = sIn[sIn.length-1-i];
        if (c in hist) {
            hist[c] += 1;
        } else {
            if(c.match(/[0-9]/) != null) {
                iNum += 1;
            } else if(c.match(/[A-Za-z]/) != null) {
                iAlp += 1;
            } else {
                iOth += 1;
            }
            iUniq += 1;
            if (iUniq >= maxUniq) {
                break;
            }
            hist[c] = 1;
        }
    }
    console.debug("DBUG:TrimHistGarbage:", hist);
    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
        return { trimmed: false, data: sIn };
    }
    // Catch and Trim
    for(let i=0; i < sIn.length; i++) {
        let c = sIn[sIn.length-1-i];
        if (!(c in hist)) {
            if (i < maxMatchLenThreshold) {
                return { trimmed: false, data: sIn };
            }
            console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
            // c is the i'th char from the end and the first one outside the histogram,
            // so exactly the last i chars are garbage; keep everything upto and including c.
            return { trimmed: true, data: sIn.substring(0, sIn.length-i) };
        }
    }
    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
    return { trimmed: true, data: "" };
 }
 /**
 * Apply trim_hist_garbage_at_end again and again, feeding each result into the next
 * attempt, till an attempt reports that it trimmed nothing. This lets multiple runs
 * of garbage with different patterns get munched one after the other.
 *
 * @param {any} sIn
 * @param {number} maxType
 * @param {number} maxUniq
 * @param {number} maxMatchLenThreshold
 */
 export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
    let res = { trimmed: true, data: sIn };
    do {
        res = trim_hist_garbage_at_end(res.data, maxType, maxUniq, maxMatchLenThreshold);
    } while (res.trimmed);
    return res.data;
 }
 /**
 * Try trim garbage at the end of the passed string, by running two passes of
 * hist-driven-garbage-trimming followed by the skip-a-bit-if-reqd repeat-pattern
 * based garbage trimming. The second pass is a blind retry.
 * @param {string} sIn
 */
 export function trim_garbage_at_end(sIn) {
    let cleaned = sIn;
    let passesLeft = 2;
    while (passesLeft > 0) {
        const afterHist = trim_hist_garbage_at_end_loop(cleaned, 8, 24, 72);
        cleaned = trim_repeat_garbage_at_end_loop(afterHist, 32, 72, 12);
        passesLeft -= 1;
    }
    return cleaned;
 }
 /**
 * NewLines array helper.
 * Allow for maintaining a list of lines.
 * Allow for a line to be builtup/appended part by part.
 *
 * Every entry in lines ends with a newline, except possibly the last one,
 * which can be a partial line still waiting for the rest of its content.
 */
 export class NewLines {
    constructor() {
        /** @type {string[]} */
        this.lines = [];
    }
    /**
     * Extracts lines from the passed string and inturn either
     * append to a previous partial line or add a new line.
     * The newline at the end of each line is retained.
     * @param {string} sLines
     */
    add_append(sLines) {
        let aLines = sLines.split("\n");
        let iLast = aLines.length - 1;
        for(let i=0; i < aLines.length; i++) {
            let line = aLines[i];
            if (i < iLast) {
                // Add back the newline removed during split
                line += "\n";
            } else if (line.length == 0) {
                // An empty final fragment means the input ended with a newline (or was
                // empty); there is no partial line to store, and adding a newline here
                // would create a blank line which was never in the input.
                break;
            }
            // The first fragment continues the previous partial line, if there is one
            if (i == 0) {
                let lastLine = this.lines[this.lines.length-1];
                if ((lastLine != undefined) && !lastLine.endsWith("\n")) {
                    this.lines[this.lines.length-1] += line;
                    continue;
                }
            }
            // Add new line
            this.lines.push(line);
        }
    }
    /**
     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
     * Optionally control whether only full lines (ie those with newline at end) will be returned
     * or will a partial line without a newline at end (can only be the last line) be returned.
     * @param {boolean} bFullWithNewLineOnly
     * @returns {string | undefined} undefined if there is no line, or if the oldest line
     *   is a partial one and only full lines were requested.
     */
    shift(bFullWithNewLineOnly=true) {
        let line = this.lines[0];
        if (line == undefined) {
            return undefined;
        }
        if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){
            return undefined;
        }
        return this.lines.shift();
    }
 }
--- a/examples/server/public_simplechat/index.html
+++ b/examples/server/public_simplechat/index.html
@ -8,21 +8,23 @@
        <meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
        <meta name="author" content="by Humans for All" />
        <meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
-        <script src="simplechat.js" defer></script>
+        <script type="importmap">
            {
                "imports": {
                    "datautils": "./datautils.mjs",
                    "ui": "./ui.mjs"
                }
            }
        </script>
        <script src="simplechat.js" type="module" defer></script>
        <link rel="stylesheet" href="simplechat.css" />
    </head>
    <body>
        <div class="samecolumn" id="fullbody">
-            <div class="sameline">
+            <div class="sameline" id="heading">
                <p class="heading flex-grow" > <b> SimpleChat </b> </p>
-                <div class="sameline">
+                <button id="settings">Settings</button>
                    <label for="api-ep">Mode:</label>
                    <select name="api-ep" id="api-ep">
                    <option value="chat" selected>Chat</option>
                    <option value="completion">Completion</option>
                    </select>
                </div>
            </div>
            <div id="sessions-div" class="sameline"></div>
@ -30,7 +32,7 @@
            <hr>
            <div class="sameline">
                <label for="system-in">System</label>
-                <input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
+                <textarea name="system" id="system-in" rows="2" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"></textarea>
            </div>
            <hr>
@ -40,7 +42,7 @@
            <hr>
            <div class="sameline">
-                <textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
+                <textarea id="user-in" class="flex-grow" rows="2" placeholder="enter your query to the ai model here" ></textarea>
                <button id="user-btn">submit</button>
            </div>
--- a/examples/server/public_simplechat/readme.md
+++ b/examples/server/public_simplechat/readme.md
@ -11,18 +11,29 @@ in a simple way with minimal code from a common code base. Inturn additionally i
 multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
 own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.
 Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
 The UI follows a responsive web design so that the layout can adapt to available display space in a usable
 enough manner, in general.
 Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
-console.
+console. Parallely some of the directly useful to end-user settings can also be changed using the provided
 settings ui.
-NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
+NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide
-culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
+any adaptive culling of old messages nor of replacing them with summary of their content etal. However there
-form of old messages culling can be achieved.
+is an optional sliding window based chat logic, which provides a simple minded culling of old messages from
 the chat history before sending to the ai model.
-NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
+NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionally stream for now.
-they can update the js file or equivalent member in gMe as needed.
+However if someone wants they can update the js file or equivalent member in gMe as needed.
 NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very
 limited / minimal way. One will need to set model, openai url and authorization bearer key in settings ui.
 ## usage
@ -52,9 +63,15 @@ Open this simple web front end from your local browser
 Once inside
-* Select between chat and completion mode. By default it is set to chat mode.
+* If you want to, you can change many of the default global settings
  * the base url (ie ip addr / domain name, port)
  * chat (default) vs completion mode
  * try trim garbage in response or not
  * amount of chat history in the context sent to server/ai-model
  * oneshot or streamed mode.
 * In completion mode
  * one normally doesnt use a system prompt in completion mode.
  * logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
    If the model requires any prefix wrt user role messages, then the end user has to
    explicitly add the needed prefix, when they enter their chat message.
@ -88,12 +105,16 @@ Once inside
 * Wait for the logic to communicate with the server and get the response.
  * the user is not allowed to enter any fresh query during this time.
  * the user input box will be disabled and a working message will be shown in it.
  * if trim garbage is enabled, the logic will try to trim repeating text kind of garbage to some extent.
 * just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
 * Using NewChat one can start independent chat sessions.
  * two independent chat sessions are setup by default.
 * When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button of
  interest, will display the full chat history till then wrt same, if you want full history for printing.
 ## Devel note
@ -104,14 +125,31 @@ by developers who may not be from web frontend background (so inturn may not be
 end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
 And given that the idea is also to help explore/experiment for developers, some flexibility is provided
-to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
+to change behaviour easily using the devel-tools/console or provided minimal settings ui (wrt few aspects).
-to explore some of the end points and ideas/implications around them.
+Skeletal logic has been implemented to explore some of the end points and ideas/implications around them.
 ### General
 Me/gMe consolidates the settings which control the behaviour into one object.
 One can see the current settings, as well as change/update them using browsers devel-tool/console.
 It is attached to the document object. Some of these can also be updated using the Settings UI.
  baseURL - the domain-name/ip-address and inturn the port to send the request.
  bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing
  of the generated response.
    the logic assumes that the text sent from the server follows utf-8 encoding.
    in streaming mode - if there is any exception, the logic traps the same and tries to ensure
    that text generated till then is not lost.
      if a very long text is being generated, which leads to no user interaction for sometime and
      inturn the machine goes into power saving mode or so, the platform may stop network connection,
      leading to exception.
  apiEP - select between /completions and /chat/completions endpoint provided by the server/ai-model.
  bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
  communicating with the server or only sends the latest user query/message.
@ -119,6 +157,19 @@ One can see the current settings, as well as change/update them using browsers d
  bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
  messages that get inserted into prompt field wrt /Completion endpoint.
  bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be
  trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of
  subsequent chat history. At the same time the actual trimmed text is shown to the user, once
  when it was generated, so user can check if any useful info/data was there in the response.
    One may be able to request the ai-model to continue (wrt the last response) (if chat-history
    is enabled as part of the chat-history-in-context setting), and chances are the ai-model will
    continue starting from the trimmed part, thus allows long response to be recovered/continued
    indirectly, in many cases.
    The histogram/freq based trimming logic is currently tuned for english language wrt its
    is-it-an-alphabetic|numeral-char regex match logic.
  chatRequestOptions - maintains the list of options/fields to send along with chat request,
  irrespective of whether /chat/completions or /completions endpoint.
@ -126,6 +177,14 @@ One can see the current settings, as well as change/update them using browsers d
    modify the existing options value or remove them, for now you can update this global var
    using browser's development-tools/console.
    For string and numeric fields in chatRequestOptions, including even those added by a user
    at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
    created.
  headers - maintains the list of http headers sent when request is made to the server. By default
  Content-Type is set to application/json. Additionally Authorization entry is provided, which can
  be set if needed using the settings ui.
  iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
  This is disabled by default. However if enabled, then in addition to latest system message, only
  the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
@ -140,7 +199,8 @@ One can see the current settings, as well as change/update them using browsers d
 By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
 implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way.
+some extent in a simple crude way. You may also want to control the context size enabled when
 the server loads ai-model, on the server end.
 Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@ -149,28 +209,15 @@ matter clearing site data, dont directly override site caching in all cases. Wor
 have to change port. Or in dev tools of browser, you may be able to disable caching fully.
-Concept of multiple chat sessions with different servers, as well as saving and restoring of
+Currently the server to communicate with is maintained globally and not as part of a specific
-those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
+chat session. So if one changes the server ip/url in setting, then all chat sessions will auto
-its instances relatively easily, however given the current goal of keeping this simple, it has
+switch to this new server, when you try using those sessions.
 not been added, for now.
 By switching between chat.add_system_begin/anytime, one can control whether one can change
 the system prompt, anytime during the conversation or only at the beginning.
 read_json_early, is to experiment with reading json response data early on, if available,
 so that user can be shown generated data, as and when it is being generated, rather than
 at the end when full data is available.
  the server flow doesnt seem to be sending back data early, atleast for request (inc options)
  that is currently sent.
  if able to read json data early on in future, as and when ai model is generating data, then
  this helper needs to indirectly update the chat div with the received data, without waiting
  for the overall data to be available.
 ### Default setup
 By default things are setup to try and make the user experience a bit better, if possible.
@ -179,7 +226,8 @@ However a developer when testing the server of ai-model may want to change these
 Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
 just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
 full chat history. This way if there is any response with garbage/repeatation, it doesnt
-mess with things beyond the next question/request/query, in some ways.
+mess with things beyond the next question/request/query, in some ways. The trim garbage
 option also tries to help avoid issues with garbage in the context to an extent.
 Set max_tokens to 1024, so that a relatively large previous response doesnt eat up the space
 available wrt next query-response. However dont forget that the server when started should
@ -189,11 +237,33 @@ also be started with a model context size of 1k or more, to be on safe side.
  internal n_predict, for now add the same here on the client side, maybe later add max_tokens
  to /completions endpoint handling code on server side.
-Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server
+NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-along with the user query. So that the model is partly set to try avoid repeating text in
+wrt the set of fields sent to server along with the user query. To check how the model behaves
-its response.
+wrt repetitions in general in the generated text response.
-A end-user can change these behaviour by editing gMe from browser's devel-tool/console.
+An end-user can change this behaviour by editing gMe from the browser's devel-tool/console or by
 using the provided settings ui.
 ### OpenAi / Equivalent API WebService
 One may be able to handshake with OpenAI/Equivalent api web service's /chat/completions endpoint
 for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
  * https://api.openai.com/v1 or similar
 * Wrt request body - gMe.chatRequestOptions
  * model (settings ui)
  * any additional fields if required in future
 * Wrt request headers - gMe.headers
  * Authorization (available through settings ui)
    * Bearer THE_OPENAI_API_KEY
  * any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so
 NOTE: Not tested, as there is no free tier api testing available. However logically this might
 work.
 ## At the end
--- a/examples/server/public_simplechat/simplechat.css
+++ b/examples/server/public_simplechat/simplechat.css
@ -21,6 +21,17 @@
 .role-user {
    background-color: lightgray;
 }
 .role-trim {
    background-color: lightpink;
 }
 .gridx2 {
    display: grid;
    grid-template-columns: repeat(2, 1fr);
    border-bottom-style: dotted;
    border-bottom-width: thin;
    border-bottom-color: lightblue;
 }
 .flex-grow {
    flex-grow: 1;
--- a/examples/server/public_simplechat/simplechat.js
+++ b/examples/server/public_simplechat/simplechat.js
@ -2,6 +2,9 @@
 // A simple completions and chat/completions test related web front end logic
 // by Humans for All
 import * as du from "./datautils.mjs";
 import * as ui from "./ui.mjs"
 class Roles {
    static System = "system";
    static User = "user";
@ -9,40 +12,65 @@ class Roles {
 }
 class ApiEP {
-    static Chat = "chat";
+    static Type = {
-    static Completion = "completion";
+        Chat: "chat",
        Completion: "completion",
    }
    static UrlSuffix = {
        'chat': `/chat/completions`,
        'completion': `/completions`,
    }
    /**
     * Build the url from given baseUrl and apiEp id.
     * @param {string} baseUrl
     * @param {string} apiEP
     */
    static Url(baseUrl, apiEP) {
        if (baseUrl.endsWith("/")) {
            baseUrl = baseUrl.substring(0, baseUrl.length-1);
        }
        return `${baseUrl}${this.UrlSuffix[apiEP]}`;
    }
 }
 let gUsageMsg = `
    <p class="role-system">Usage</p>
    <ul class="ul1">
-    <li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
+    <li> System prompt above, to try control ai response characteristics.</li>
        <ul class="ul2">
-        <li> Completion mode normally wont have a system prompt.</li>
+        <li> Completion mode - no system prompt normally.</li>
        </ul>
    <li> Enter your query to ai assistant below.</li>
        <ul class="ul2">
        <li> Completion mode doesnt insert user/role: prefix implicitly.</li>
    <li> Use shift+enter for inserting enter/newline.</li>
-        </ul>
+    <li> Enter your query to ai assistant below.</li>
    <li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
        <ul class="ul2">
-        <li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
+        <li> ChatHistInCtxt, MaxTokens, ModelCtxt window to expand</li>
        </ul>
    </ul>
 `;
 /** @typedef {{role: string, content: string}[]} ChatMessages */
 /** @typedef {{iLastSys: number, xchat: ChatMessages}} SimpleChatODS */
 class SimpleChat {
-    constructor() {
+    /**
     * @param {string} chatId
     */
    constructor(chatId) {
        this.chatId = chatId;
        /**
         * Maintain in a form suitable for common LLM web service chat/completions' messages entry
         * @type {ChatMessages}
         */
        this.xchat = [];
        this.iLastSys = -1;
        this.latestResponse = "";
    }
    clear() {
@ -50,6 +78,27 @@ class SimpleChat {
        this.iLastSys = -1;
    }
    ods_key() {
        return `SimpleChat-${this.chatId}`
    }
    save() {
        /** @type {SimpleChatODS} */
        let ods = {iLastSys: this.iLastSys, xchat: this.xchat};
        localStorage.setItem(this.ods_key(), JSON.stringify(ods));
    }
    load() {
        let sods = localStorage.getItem(this.ods_key());
        if (sods == null) {
            return;
        }
        /** @type {SimpleChatODS} */
        let ods = JSON.parse(sods);
        this.iLastSys = ods.iLastSys;
        this.xchat = ods.xchat;
    }
    /**
     * Recent chat messages.
     * If iRecentUserMsgCnt < 0
@ -94,6 +143,15 @@ class SimpleChat {
        return rchat;
    }
    /**
     * Collate the latest response from the server/ai-model, as it is becoming available.
     * This is mainly useful for the stream mode.
     * @param {string} content
     */
    append_response(content) {
        this.latestResponse += content;
    }
    /**
     * Add an entry into xchat
     * @param {string} role
@ -107,6 +165,7 @@ class SimpleChat {
        if (role == Roles.System) {
            this.iLastSys = this.xchat.length - 1;
        }
        this.save();
        return true;
    }
@ -121,10 +180,8 @@ class SimpleChat {
        }
        let last = undefined;
        for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
-            let entry = document.createElement("p");
+            let entry = ui.el_create_append_p(`${x.role}: ${x.content}`, div);
            entry.className = `role-${x.role}`;
            entry.innerText = `${x.role}: ${x.content}`;
            div.appendChild(entry);
            last = entry;
        }
        if (last !== undefined) {
@ -132,21 +189,45 @@ class SimpleChat {
        } else {
            if (bClear) {
                div.innerHTML = gUsageMsg;
                gMe.setup_load(div, this);
                gMe.show_info(div);
            }
        }
        return last;
    }
    /**
     * Setup the fetch headers.
     * It picks the headers from gMe.headers.
     * It inserts Authorization only if its non-empty.
     * @param {string} apiEP
     */
    fetch_headers(apiEP) {
        let headers = new Headers();
        for(let k in gMe.headers) {
            let v = gMe.headers[k];
            if ((k == "Authorization") && (v.trim() == "")) {
                continue;
            }
            headers.append(k, v);
        }
        return headers;
    }
    /**
     * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
     * The needed fields/options are picked from a global object.
     * Add optional stream flag, if required.
     * Convert the json into string.
     * @param {Object} obj
     */
-    request_jsonstr(obj) {
+    request_jsonstr_extend(obj) {
        for(let k in gMe.chatRequestOptions) {
            obj[k] = gMe.chatRequestOptions[k];
        }
        if (gMe.bStream) {
            obj["stream"] = true;
        }
        return JSON.stringify(obj);
    }
@ -157,7 +238,7 @@ class SimpleChat {
        let req = {
            messages: this.recent_chat(gMe.iRecentUserMsgCnt),
        }
-        return this.request_jsonstr(req);
+        return this.request_jsonstr_extend(req);
    }
    /**
@ -180,7 +261,60 @@ class SimpleChat {
        let req = {
            prompt: prompt,
        }
-        return this.request_jsonstr(req);
+        return this.request_jsonstr_extend(req);
    }
    /**
     * Return a string form of json object suitable for specified api endpoint.
     * @param {string} apiEP
     */
    request_jsonstr(apiEP) {
        if (apiEP == ApiEP.Type.Chat) {
            return this.request_messages_jsonstr();
        } else {
            return this.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
        }
    }
    /**
     * Extract the ai-model/assistant's response from the http response got.
     * Optionally trim the message wrt any garbage at the end.
     * @param {any} respBody
     * @param {string} apiEP
     */
    response_extract(respBody, apiEP) {
        let assistant = "";
        if (apiEP == ApiEP.Type.Chat) {
            assistant = respBody["choices"][0]["message"]["content"];
        } else {
            try {
                assistant = respBody["choices"][0]["text"];
            } catch {
                assistant = respBody["content"];
            }
        }
        return assistant;
    }
    /**
     * Extract the ai-model/assistant's response from the http response got in streaming mode.
     * @param {any} respBody
     * @param {string} apiEP
     */
    response_extract_stream(respBody, apiEP) {
        let assistant = "";
        if (apiEP == ApiEP.Type.Chat) {
            if (respBody["choices"][0]["finish_reason"] !== "stop") {
                assistant = respBody["choices"][0]["delta"]["content"];
            }
        } else {
            try {
                assistant = respBody["choices"][0]["text"];
            } catch {
                assistant = respBody["content"];
            }
        }
        return assistant;
    }
    /**
@ -239,53 +373,99 @@ class SimpleChat {
        return sysPrompt;
    }
 }
-
+    /**
-let gBaseURL = "http://127.0.0.1:8080";
+     * Handle the multipart response from server/ai-model
-let gChatURL = {
+     * @param {Response} resp
-    'chat': `${gBaseURL}/chat/completions`,
+     * @param {string} apiEP
-    'completion': `${gBaseURL}/completions`,
+     * @param {HTMLDivElement} elDiv
 }
 /**
 * Set the class of the children, based on whether it is the idSelected or not.
 * @param {HTMLDivElement} elBase
 * @param {string} idSelected
 * @param {string} classSelected
 * @param {string} classUnSelected
     */
-function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
+    async handle_response_multipart(resp, apiEP, elDiv) {
-    for(let child of elBase.children) {
+        let elP = ui.el_create_append_p("", elDiv);
-        if (child.id == idSelected) {
+        if (!resp.body) {
-            child.className = classSelected;
+            throw Error("ERRR:SimpleChat:SC:HandleResponseMultiPart:No body...");
        }
        let tdUtf8 = new TextDecoder("utf-8");
        let rr = resp.body.getReader();
        this.latestResponse = "";
        let xLines = new du.NewLines();
        while(true) {
            let { value: cur,  done: done } = await rr.read();
            if (cur) {
                let curBody = tdUtf8.decode(cur, {stream: true});
                console.debug("DBUG:SC:PART:Str:", curBody);
                xLines.add_append(curBody);
            }
            while(true) {
                let curLine = xLines.shift(!done);
                if (curLine == undefined) {
                    break;
                }
                if (curLine.trim() == "") {
                    continue;
                }
                if (curLine.startsWith("data:")) {
                    curLine = curLine.substring(5);
                }
                let curJson = JSON.parse(curLine);
                console.debug("DBUG:SC:PART:Json:", curJson);
                this.append_response(this.response_extract_stream(curJson, apiEP));
            }
            elP.innerText = this.latestResponse;
            elP.scrollIntoView(false);
            if (done) {
                break;
            }
        }
        console.debug("DBUG:SC:PART:Full:", this.latestResponse);
        return this.latestResponse;
    }
    /**
     * Handle the oneshot response from server/ai-model
     * @param {Response} resp
     * @param {string} apiEP
     */
    async handle_response_oneshot(resp, apiEP) {
        let respBody = await resp.json();
        console.debug(`DBUG:SimpleChat:SC:${this.chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
        return this.response_extract(respBody, apiEP);
    }
    /**
     * Handle the response from the server be it in oneshot or multipart/stream mode.
     * Also take care of the optional garbage trimming.
     * @param {Response} resp
     * @param {string} apiEP
     * @param {HTMLDivElement} elDiv
     */
    async handle_response(resp, apiEP, elDiv) {
        let theResp = {
            assistant: "",
            trimmed: "",
        }
        if (gMe.bStream) {
            try {
                theResp.assistant = await this.handle_response_multipart(resp, apiEP, elDiv);
                this.latestResponse = "";
            } catch (error) {
                theResp.assistant = this.latestResponse;
                this.add(Roles.Assistant, theResp.assistant);
                this.latestResponse = "";
                throw error;
            }
        } else {
-            child.className = classUnSelected;
+            theResp.assistant = await this.handle_response_oneshot(resp, apiEP);
        }
        if (gMe.bTrimGarbage) {
            let origMsg = theResp.assistant;
            theResp.assistant = du.trim_garbage_at_end(origMsg);
            theResp.trimmed = origMsg.substring(theResp.assistant.length);
        }
        this.add(Roles.Assistant, theResp.assistant);
        return theResp;
    }
 }
 /**
 * Make a button element with the given id and click handler.
 * name and innerText fall back to id, when they are not given (or are empty).
 * @param {string} id
 * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
 * @param {string | undefined} name
 * @param {string | undefined} innerText
 */
 function el_create_button(id, callback, name=undefined, innerText=undefined) {
    const elBtn = document.createElement("button");
    elBtn.id = id;
    elBtn.name = name ? name : id;
    elBtn.innerText = innerText ? innerText : id;
    elBtn.addEventListener("click", callback);
    return elBtn;
 }
@ -302,14 +482,16 @@ class MultiChatUI {
        this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
        this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
        this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
-        this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
+        this.elDivHeading = /** @type{HTMLSelectElement} */(document.getElementById("heading"));
        this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
        this.elBtnSettings = /** @type{HTMLButtonElement} */(document.getElementById("settings"));
        this.validate_element(this.elInSystem, "system-in");
        this.validate_element(this.elDivChat, "chat-div");
        this.validate_element(this.elInUser, "user-in");
-        this.validate_element(this.elSelectApiEP, "api-ep");
+        this.validate_element(this.elDivHeading, "heading");
        this.validate_element(this.elDivChat, "sessions-div");
        this.validate_element(this.elBtnSettings, "settings");
    }
    /**
@ -350,13 +532,18 @@ class MultiChatUI {
            this.handle_session_switch(this.curChatId);
        }
        this.elBtnSettings.addEventListener("click", (ev)=>{
            this.elDivChat.replaceChildren();
            gMe.show_settings(this.elDivChat);
        });
        this.elBtnUser.addEventListener("click", (ev)=>{
            if (this.elInUser.disabled) {
                return;
            }
-            this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
+            this.handle_user_submit(this.curChatId, gMe.apiEP).catch((/** @type{Error} */reason)=>{
                let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
-                console.debug(msg.replace("\n", ":"));
+                console.error(msg.replace("\n", ":"));
                alert(msg);
                this.ui_reset_userinput();
            });
@ -377,6 +564,8 @@ class MultiChatUI {
            // allow user to insert enter into the system prompt using shift+enter.
            // while just pressing enter key will lead to setting the system prompt.
            if ((ev.key === "Enter") && (!ev.shiftKey)) {
                let value = this.elInSystem.value;
                this.elInSystem.value = value.substring(0,value.length-1);
                let chat = this.simpleChats[this.curChatId];
                chat.add_system_anytime(this.elInSystem.value, this.curChatId);
                chat.show(this.elDivChat);
@ -392,34 +581,12 @@ class MultiChatUI {
     * @param {boolean} bSwitchSession
     */
    new_chat_session(chatId, bSwitchSession=false) {
-        this.simpleChats[chatId] = new SimpleChat();
+        this.simpleChats[chatId] = new SimpleChat(chatId);
        if (bSwitchSession) {
            this.handle_session_switch(chatId);
        }
    }
    /**
     * Try read json response early, if available.
     * @param {Response} resp
     */
    async read_json_early(resp) {
        if (!resp.body) {
            throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
        }
        let tdUtf8 = new TextDecoder("utf-8");
        let rr = resp.body.getReader();
        let gotBody = "";
        while(true) {
            let { value: cur,  done: done} = await rr.read();
            let curBody = tdUtf8.decode(cur);
            console.debug("DBUG:SC:PART:", curBody);
            gotBody += curBody;
            if (done) {
                break;
            }
        }
        return JSON.parse(gotBody);
    }
    /**
     * Handle user query submit request, wrt specified chat session.
@ -434,7 +601,7 @@ class MultiChatUI {
        // So if user wants to simulate a multi-chat based completion query,
        // they will have to enter the full thing, as a suitable multiline
        // user input/query.
-        if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
+        if ((apiEP == ApiEP.Type.Completion) && (gMe.bCompletionFreshChatAlways)) {
            chat.clear();
        }
@ -447,41 +614,26 @@ class MultiChatUI {
        }
        chat.show(this.elDivChat);
-        let theBody;
+        let theUrl = ApiEP.Url(gMe.baseURL, apiEP);
-        let theUrl = gChatURL[apiEP]
+        let theBody = chat.request_jsonstr(apiEP);
        if (apiEP == ApiEP.Chat) {
            theBody = chat.request_messages_jsonstr();
        } else {
            theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
        }
        this.elInUser.value = "working...";
        this.elInUser.disabled = true;
        console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
        let theHeaders = chat.fetch_headers(apiEP);
        let resp = await fetch(theUrl, {
            method: "POST",
-            headers: {
+            headers: theHeaders,
                "Content-Type": "application/json",
            },
            body: theBody,
        });
-        let respBody = await resp.json();
+        let theResp = await chat.handle_response(resp, apiEP, this.elDivChat);
        //let respBody = await this.read_json_early(resp);
        console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
        let assistantMsg;
        if (apiEP == ApiEP.Chat) {
            assistantMsg = respBody["choices"][0]["message"]["content"];
        } else {
            try {
                assistantMsg = respBody["choices"][0]["text"];
            } catch {
                assistantMsg = respBody["content"];
            }
        }
        chat.add(Roles.Assistant, assistantMsg);
        if (chatId == this.curChatId) {
            chat.show(this.elDivChat);
            if (theResp.trimmed.length > 0) {
                let p = ui.el_create_append_p(`TRIMMED:${theResp.trimmed}`, this.elDivChat);
                p.className="role-trim";
            }
        } else {
            console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
        }
@ -500,7 +652,7 @@ class MultiChatUI {
        }
        elDiv.replaceChildren();
        // Btn for creating new chat session
-        let btnNew = el_create_button("New CHAT", (ev)=> {
+        let btnNew = ui.el_create_button("New CHAT", (ev)=> {
            if (this.elInUser.disabled) {
                console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
                alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
@ -514,7 +666,7 @@ class MultiChatUI {
            }
            this.new_chat_session(chatIdGot, true);
            this.create_session_btn(elDiv, chatIdGot);
-            el_children_config_class(elDiv, chatIdGot, "session-selected", "");
+            ui.el_children_config_class(elDiv, chatIdGot, "session-selected", "");
        });
        elDiv.appendChild(btnNew);
        // Btns for existing chat sessions
@ -528,7 +680,7 @@ class MultiChatUI {
    }
    create_session_btn(elDiv, cid) {
-        let btn = el_create_button(cid, (ev)=>{
+        let btn = ui.el_create_button(cid, (ev)=>{
            let target = /** @type{HTMLButtonElement} */(ev.target);
            console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
            if (this.elInUser.disabled) {
@ -537,7 +689,7 @@ class MultiChatUI {
                return;
            }
            this.handle_session_switch(target.id);
-            el_children_config_class(elDiv, target.id, "session-selected", "");
+            ui.el_children_config_class(elDiv, target.id, "session-selected", "");
        });
        elDiv.appendChild(btn);
        return btn;
@ -567,46 +719,183 @@ class MultiChatUI {
 class Me {
    constructor() {
        this.baseURL = "http://127.0.0.1:8080";
        this.defaultChatIds = [ "Default", "Other" ];
        this.multiChat = new MultiChatUI();
        this.bStream = true;
        this.bCompletionFreshChatAlways = true;
        this.bCompletionInsertStandardRolePrefix = false;
        this.bTrimGarbage = true;
        this.iRecentUserMsgCnt = 2;
        this.sRecentUserMsgCnt = {
            "Full": -1,
            "Last0": 1,
            "Last1": 2,
            "Last2": 3,
            "Last4": 5,
        };
        this.apiEP = ApiEP.Type.Chat;
        this.headers = {
            "Content-Type": "application/json",
            "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
        }
        // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
        this.chatRequestOptions = {
            "model": "gpt-3.5-turbo",
            "temperature": 0.7,
            "max_tokens": 1024,
-            "frequency_penalty": 1.2,
+            "n_predict": 1024,
-            "presence_penalty": 1.2,
+            //"frequency_penalty": 1.2,
-            "n_predict": 1024
+            //"presence_penalty": 1.2,
        };
    }
    /**
     * Disable console.debug by mapping it to a empty function.
     */
    debug_disable() {
        this.console_debug = console.debug;
        console.debug = () => {
        };
    }
    /**
     * Setup the load saved chat ui.
     * @param {HTMLDivElement} div
     * @param {SimpleChat} chat
     */
    setup_load(div, chat) {
        if (!(chat.ods_key() in localStorage)) {
            return;
        }
        div.innerHTML += `<p class="role-system">Restore</p>
        <p>Load previously saved chat session, if available</p>`;
        let btn = ui.el_create_button(chat.ods_key(), (ev)=>{
            console.log("DBUG:SimpleChat:SC:Load", chat);
            chat.load();
            queueMicrotask(()=>{
                chat.show(div);
                this.multiChat.elInSystem.value = chat.get_system_latest();
            });
        });
        div.appendChild(btn);
    }
    /**
     * Show the configurable parameters info in the passed Div element.
     * @param {HTMLDivElement} elDiv
     * @param {boolean} bAll
     */
    show_info(elDiv, bAll=false) {
        let p = ui.el_create_append_p("Settings (devel-tools-console document[gMe])", elDiv);
        p.className = "role-system";
        if (bAll) {
            ui.el_create_append_p(`baseURL:${this.baseURL}`, elDiv);
            ui.el_create_append_p(`Authorization:${this.headers["Authorization"]}`, elDiv);
            ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
            ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
            ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
        }
        ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
        ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
    }
    /**
     * Auto create ui input elements for fields in ChatRequestOptions
     * Currently supports text and number field types.
     * @param {HTMLDivElement} elDiv
     */
-    show_info(elDiv) {
+    show_settings_chatrequestoptions(elDiv) {
        let typeDict = {
            "string": "text",
            "number": "number",
        };
        let fs = document.createElement("fieldset");
        let legend = document.createElement("legend");
        legend.innerText = "ChatRequestOptions";
        fs.appendChild(legend);
        elDiv.appendChild(fs);
        for(const k in this.chatRequestOptions) {
            let val = this.chatRequestOptions[k];
            let type = typeof(val);
            if (!((type == "string") || (type == "number"))) {
                continue;
            }
            let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
                if (type == "number") {
                    val = Number(val);
                }
                this.chatRequestOptions[k] = val;
            });
            fs.appendChild(inp.div);
        }
    }
-        var p = document.createElement("p");
+    /**
-        p.innerText = "Settings (devel-tools-console gMe)";
+     * Show settings ui for configurable parameters, in the passed Div element.
-        p.className = "role-system";
+     * @param {HTMLDivElement} elDiv
-        elDiv.appendChild(p);
+     */
    show_settings(elDiv) {
-        var p = document.createElement("p");
+        let inp = ui.el_creatediv_input("SetBaseURL", "BaseURL", "text", this.baseURL, (val)=>{
-        p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
+            this.baseURL = val;
-        elDiv.appendChild(p);
+        });
        elDiv.appendChild(inp.div);
-        p = document.createElement("p");
+        inp = ui.el_creatediv_input("SetAuthorization", "Authorization", "text", this.headers["Authorization"], (val)=>{
-        p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
+            this.headers["Authorization"] = val;
-        elDiv.appendChild(p);
+        });
        inp.el.placeholder = "Bearer OPENAI_API_KEY";
        elDiv.appendChild(inp.div);
-        p = document.createElement("p");
+        let bb = ui.el_creatediv_boolbutton("SetStream", "Stream", {true: "[+] yes stream", false: "[-] do oneshot"}, this.bStream, (val)=>{
-        p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
+            this.bStream = val;
-        elDiv.appendChild(p);
+        });
        elDiv.appendChild(bb.div);
-        p = document.createElement("p");
+        bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
-        p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
+            this.bCompletionFreshChatAlways = val;
-        elDiv.appendChild(p);
+        });
        elDiv.appendChild(bb.div);
        bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", "CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{
            this.bCompletionInsertStandardRolePrefix = val;
        });
        elDiv.appendChild(bb.div);
        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
            this.bTrimGarbage = val;
        });
        elDiv.appendChild(bb.div);
        let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
        });
        elDiv.appendChild(sel.div);
        sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
            this.apiEP = ApiEP.Type[val];
        });
        elDiv.appendChild(sel.div);
        this.show_settings_chatrequestoptions(elDiv);
    }
@ -619,6 +908,9 @@ let gMe;
 function startme() {
    console.log("INFO:SimpleChat:StartMe:Starting...");
    gMe = new Me();
    gMe.debug_disable();
    document["gMe"] = gMe;
    document["du"] = du;
    for (let cid of gMe.defaultChatIds) {
        gMe.multiChat.new_chat_session(cid);
    }
--- a/examples/server/public_simplechat/ui.mjs
+++ b/examples/server/public_simplechat/ui.mjs
@ -0,0 +1,211 @@
 //@ts-check
 // Helpers to work with html elements
 // by Humans for All
 //
 /**
 * Walk the direct children of elBase and assign each one a css class:
 * any child whose id matches idSelected gets classSelected, every other child gets classUnSelected.
 * @param {HTMLDivElement} elBase
 * @param {string} idSelected
 * @param {string} classSelected
 * @param {string} classUnSelected
 */
 export function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
    for (const elChild of elBase.children) {
        const bSelected = (elChild.id == idSelected);
        elChild.className = bSelected ? classSelected : classUnSelected;
    }
 }
 /**
 * Build a button element and hook the given callback to its click event.
 * When name or innerText is missing (or empty), the id is used in its place.
 * @param {string} id
 * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
 * @param {string | undefined} name
 * @param {string | undefined} innerText
 */
 export function el_create_button(id, callback, name=undefined, innerText=undefined) {
    const elBtn = document.createElement("button");
    elBtn.id = id;
    elBtn.name = name ? name : id;
    elBtn.innerText = innerText ? innerText : id;
    elBtn.addEventListener("click", callback);
    return elBtn;
 }
 /**
 * Build a paragraph element holding the given text.
 * The id is set only if one is passed, and the paragraph is appended to elParent only if one is passed.
 * The new paragraph is returned in all cases.
 * @param {string} text
 * @param {HTMLElement | undefined} elParent
 * @param {string | undefined} id
 */
 export function el_create_append_p(text, elParent=undefined, id=undefined) {
    const elPara = document.createElement("p");
    elPara.innerText = text;
    if (id) elPara.id = id;
    if (elParent) elParent.appendChild(elPara);
    return elPara;
 }
 /**
 * Build a button that stands for a boolean value.
 * The button shows texts.true or texts.false depending on the current value.
 * Each click flips the value, refreshes the shown text and then reports the new value through cb.
 * The current value and a copy of the texts are kept on the element as "xbool" and "xtexts".
 *
 * @param {string} id
 * @param {{true: string, false: string}} texts
 * @param {boolean} defaultValue
 * @param {function(boolean):void} cb
 */
 export function el_create_boolbutton(id, texts, defaultValue, cb) {
    const elBtn = document.createElement("button");
    elBtn["xbool"] = defaultValue;
    elBtn["xtexts"] = structuredClone(texts);
    // show the text that corresponds to the value currently stored on the button
    const showCurrent = () => {
        elBtn.innerText = elBtn["xtexts"][String(elBtn["xbool"])];
    };
    showCurrent();
    if (id) {
        elBtn.id = id;
    }
    elBtn.addEventListener('click', () => {
        elBtn["xbool"] = !elBtn["xbool"];
        showCurrent();
        cb(elBtn["xbool"]);
    })
    return elBtn;
 }
 /**
 * Build a div that holds a label followed by a bool button (see el_create_boolbutton).
 * The label is tied to the button through its "for" attribute.
 * Returns both the wrapping div and the button itself.
 * @param {string} id
 * @param {string} label
 * @param {{ true: string; false: string; }} texts
 * @param {boolean} defaultValue
 * @param {(arg0: boolean) => void} cb
 * @param {string} className
 */
 export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, className="gridx2") {
    const elLabel = document.createElement("label");
    elLabel.setAttribute("for", id);
    elLabel.innerText = label;
    const elBtn = el_create_boolbutton(id, texts, defaultValue, cb);
    const elDiv = document.createElement("div");
    elDiv.className = className;
    elDiv.appendChild(elLabel);
    elDiv.appendChild(elBtn);
    return { div: elDiv, el: elBtn };
 }
 /**
 * Build a select element offering one entry per key of options.
 * * options: an object made of name-value pairs; the names become the entries shown.
 * * defaultOption: the value whose name should be preselected.
 * * cb : called on change with the name string of the entry picked.
 * The default value and a copy of the options are kept on the element as "xselected" and "xoptions".
 *
 * @param {string} id
 * @param {Object<string,*>} options
 * @param {*} defaultOption
 * @param {function(string):void} cb
 */
 export function el_create_select(id, options, defaultOption, cb) {
    const elSel = document.createElement("select");
    elSel["xselected"] = defaultOption;
    elSel["xoptions"] = structuredClone(options);
    Object.keys(options).forEach((optName) => {
        const elOpt = document.createElement("option");
        elOpt.value = optName;
        elOpt.innerText = optName;
        if (options[optName] == defaultOption) {
            elOpt.selected = true;
        }
        elSel.appendChild(elOpt);
    });
    if (id) {
        elSel.id = id;
        elSel.name = id;
    }
    elSel.addEventListener('change', (ev)=>{
        const elTarget = /** @type{HTMLSelectElement} */(ev.target);
        console.log("DBUG:UI:Select:", id, ":", elTarget.value);
        cb(elTarget.value);
    })
    return elSel;
 }
 /**
 * Build a div that holds a label followed by a select element (see el_create_select).
 * The label is tied to the select through its "for" attribute.
 * Returns both the wrapping div and the select itself.
 *
 * @param {string} id
 * @param {any} label
 * @param {{ [x: string]: any; }} options
 * @param {any} defaultOption
 * @param {(arg0: string) => void} cb
 * @param {string} className
 */
 export function el_creatediv_select(id, label, options, defaultOption, cb, className="gridx2") {
    const elLabel = document.createElement("label");
    elLabel.setAttribute("for", id);
    elLabel.innerText = label;
    const elSel = el_create_select(id, options,defaultOption, cb);
    const elDiv = document.createElement("div");
    elDiv.className = className;
    elDiv.appendChild(elLabel);
    elDiv.appendChild(elSel);
    return { div: elDiv, el: elSel };
 }
 /**
 * Build an input element of the given type, preloaded with defaultValue.
 * On every change event, cb is called with the element's current value.
 *
 * @param {string} id
 * @param {string} type
 * @param {any} defaultValue
 * @param {function(any):void} cb
 */
 export function el_create_input(id, type, defaultValue, cb) {
    const elInput = document.createElement("input");
    // the type is set ahead of the value, as in the order the fields are listed above
    elInput.type = type;
    elInput.value = defaultValue;
    if (id) {
        elInput.id = id;
    }
    const onChange = () => {
        cb(elInput.value);
    };
    elInput.addEventListener('change', onChange);
    return elInput;
 }
 /**
 * Build a div that holds a label followed by an input element (see el_create_input).
 * The label is tied to the input through its "for" attribute.
 * Returns both the wrapping div and the input itself.
 *
 * @param {string} id
 * @param {string} label
 * @param {string} type
 * @param {any} defaultValue
 * @param {function(any):void} cb
 * @param {string} className
 */
 export function el_creatediv_input(id, label, type, defaultValue, cb, className="gridx2") {
    const elLabel = document.createElement("label");
    elLabel.setAttribute("for", id);
    elLabel.innerText = label;
    const elInput = el_create_input(id, type, defaultValue, cb);
    const elDiv = document.createElement("div");
    elDiv.className = className;
    elDiv.appendChild(elLabel);
    elDiv.appendChild(elInput);
    return { div: elDiv, el: elInput };
 }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -17,9 +17,20 @@
 #include "json.hpp"
 // auto generated files (update with ./deps.sh)
 #include "colorthemes.css.hpp"
 #include "style.css.hpp"
 #include "theme-beeninorder.css.hpp"
 #include "theme-ketivah.css.hpp"
 #include "theme-mangotango.css.hpp"
 #include "theme-playground.css.hpp"
 #include "theme-polarnight.css.hpp"
 #include "theme-snowstorm.css.hpp"
 #include "index.html.hpp"
 #include "index-new.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
 #include "system-prompts.js.hpp"
 #include "prompt-formats.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
 #include <atomic>
@ -3750,7 +3761,6 @@ int main(int argc, char ** argv) {
        // Set the base directory for serving static files
        svr->set_base_dir(sparams.public_path);
    }
    // using embedded static files
    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
@ -3758,6 +3768,19 @@ int main(int argc, char ** argv) {
    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
      json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
    // add new-ui files
    svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
    svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
    svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
    svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
    svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
    svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
    // register API routes
    svr->Get ("/health",              handle_health);
    svr->Get ("/slots",               handle_slots);
--- a/flake.lock
+++ b/flake.lock
@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1715865404,
+        "lastModified": 1717285511,
-        "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
+        "narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
+        "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
        "type": "github"
      },
      "original": {
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1716509168,
+        "lastModified": 1716948383,
-        "narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
+        "narHash": "sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "bfb7a882678e518398ce9a31a881538679f6f092",
+        "rev": "ad57eef4ef0659193044870c731987a6df5cf56b",
        "type": "github"
      },
      "original": {
@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1714640452,
+        "lastModified": 1717284937,
-        "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
+        "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
      }
    },
    "root": {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                // this tensor was allocated without ggml-backend
                return;
            }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
            if (t->view_src == NULL) {
                ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
            }
        }
    }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
    if (dst_buf->iface.cpy_tensor) {
-        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
    }
    return false;
 }
@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 // utils
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);
-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst->view_src->buffer, dst);
+        ggml_backend_view_init(dst);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -225,7 +225,7 @@ extern "C" {
    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 #ifdef  __cplusplus
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -1870,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
        }
    }
 #else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
@ -2886,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
            return true;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
@ -2903,10 +2905,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
 #else
-            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+            if (op->src[0]->ne[0] == 128) {
                return true;
            }
-            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+            if (op->src[0]->ne[0] ==  64 && op->src[1]->type == GGML_TYPE_F16) {
                return true;
            }
            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        default:
            return false;
--- a/ggml-cuda/concat.cu
+++ b/ggml-cuda/concat.cu
@ -1,5 +1,6 @@
 #include "concat.cuh"
 // contiguous kernels
 static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
@ -92,25 +93,77 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
    concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }
 // non-contiguous kernel (slow)
 // Addresses every element through its byte strides (nb*), so no layout is assumed.
 // blockIdx.x/y/z select the dst indices i1/i2/i3; the threads of a block step over i0.
 // An element whose indices all lie inside src0's extents is read from src0; anything else is
 // read from src1 with the index along `dim` moved back by src0's extent on that dimension.
 static __global__ void concat_f32_non_cont(
        const char * src0,
        const char * src1,
              char * dst,
           int64_t   ne00,
           int64_t   ne01,
           int64_t   ne02,
           int64_t   ne03,
          uint64_t   nb00,
          uint64_t   nb01,
          uint64_t   nb02,
          uint64_t   nb03,
           int64_t /*ne10*/,
           int64_t /*ne11*/,
           int64_t /*ne12*/,
           int64_t /*ne13*/,
          uint64_t   nb10,
          uint64_t   nb11,
          uint64_t   nb12,
          uint64_t   nb13,
           int64_t   ne0,
           int64_t /*ne1*/,
           int64_t /*ne2*/,
           int64_t /*ne3*/,
          uint64_t   nb0,
          uint64_t   nb1,
          uint64_t   nb2,
          uint64_t   nb3,
          int32_t   dim) {
    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.y;
    const int64_t i3 = blockIdx.z;
    // per-dimension index shift for src1: src0's extent on the concat dimension, zero elsewhere
    int64_t shift[4] = {0, 0, 0, 0};
    switch (dim) {
        case 0:  shift[dim] = ne00; break;
        case 1:  shift[dim] = ne01; break;
        case 2:  shift[dim] = ne02; break;
        default: shift[dim] = ne03; break;
    }
    // i1, i2 and i3 are fixed for the whole block, so this part of the src0 bounds test is too
    const bool outer_in_src0 = i1 < ne01 && i2 < ne02 && i3 < ne03;
    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        const char * in = (i0 < ne00 && outer_in_src0)
            ? src0 + (i3           )*nb03 + (i2           )*nb02 + (i1           )*nb01 + (i0           )*nb00
            : src1 + (i3 - shift[3])*nb13 + (i2 - shift[2])*nb12 + (i1 - shift[1])*nb11 + (i0 - shift[0])*nb10;
        char * out = dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0;
        *(float *) out = *(const float *) in;
    }
 }
 void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    const int32_t dim = ((int32_t *) dst->op_params)[0];
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        const float * src0_d = (const float *)src0->data;
        const float * src1_d = (const float *)src1->data;
        float * dst_d = (float *)dst->data;
        if (dim != 3) {
            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
                concat_f32_cuda(
@ -127,4 +180,17 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            CUDA_CHECK(cudaMemcpyAsync(dst_d,           src0_d, size0, cudaMemcpyDeviceToDevice, stream));
            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
        }
    } else {
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
                (const char *)src0->data,
                (const char *)src1->data,
                (      char *)dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
                dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
                dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3], dim);
    }
 }
--- a/ggml-cuda/fattn-common.cuh
+++ b/ggml-cuda/fattn-common.cuh
@ -1,4 +1,8 @@
 #pragma once
 #include "common.cuh"
 #include "convert.cuh"
 #include "vecdotq.cuh"
 #include <cstdint>
@ -34,11 +38,523 @@ typedef void (* fattn_kernel_t)(
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3);
 typedef half (*vec_dot_KQ_f16_t)(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
 typedef float (*vec_dot_KQ_f32_t)(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
 // Per-thread partial dot product between one K row stored as q4_0 blocks and a Q vector that
 // has already been quantized to packed 8-bit ints.
 //   T      - accumulator and return type (half or float); it also selects whether Q_ds_v is
 //            read as half2 or as float2.
 //   D      - compile-time size; the loop walks D/sizeof(int) packed ints in steps of WARP_SIZE
 //            (presumably D is the head size in elements - TODO confirm against the callers).
 //   K_c    - raw pointer to the block_q4_0 data.
 //   Q_v    - unused here; present so the signature matches the vec_dot_KQ_*_t function pointer types.
 //   Q_q8   - packed int8 Q values, one int consumed per loop iteration of this thread.
 //   Q_ds_v - one pair per loop iteration; .x/low is multiplied with the integer dot product and
 //            .y/high is subtracted (presumably the q8_1 scale and sum - TODO confirm).
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 // When __CUDA_ARCH__ is below MIN_CC_DP4A the function body is NO_DEVICE_CODE.
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
    const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
    GGML_UNUSED(Q_v);
    // Accumulate in T like the other vec_dot_fattn_vec_KQ_* variants do. A hard-coded half
    // accumulator would round the float instantiation to fp16 on every iteration.
    T sum = 0.0f;
 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
        const int k_KQ = k_KQ_0 + threadIdx.x; // packed-int index handled by this thread
        const int ib    = k_KQ /  QI8_1;       // K block index
        const int iqs4  = k_KQ %  QI4_0;       // int index inside the block's qs array
        const int shift = k_KQ & (QI8_1/2);    // right shift applied before masking the nibbles
        // keep the low 4 bits of every byte after the shift -> four 4-bit quants in one int
        const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
        const int u = Q_q8[k_KQ_0/WARP_SIZE];
        // 4-way byte-wise integer dot product of the K quants with the Q quants
        const int sumi = __dp4a(v, u, 0);
 #if FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2  * Q_ds = (const half2  *) Q_ds_v;
            const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
        } else
 #endif // FP16_AVAILABLE
        {
            const float2 * Q_ds = (const float2 *) Q_ds_v;
            sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (8/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
        }
    }
    return sum;
 #else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
 #endif  // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 // Per-thread partial dot product between one K row stored as q4_1 blocks and a Q vector that
 // has already been quantized to packed 8-bit ints (Q_q8) plus one pair per loop iteration (Q_ds_v).
 // T is the accumulator/return type and selects whether Q_ds_v is read as half2 or float2.
 // Q_v is unused; it is present so the signature matches the vec_dot_KQ_*_t function pointer types.
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 // When __CUDA_ARCH__ is below MIN_CC_DP4A the function body is NO_DEVICE_CODE.
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
    const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
    GGML_UNUSED(Q_v);
    T sum = 0.0f;
 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
        // packed-int index handled by this thread in this iteration
        const int k_KQ = k_KQ_0 + threadIdx.x;
        const int ib    = k_KQ /  QI8_1;
        const int iqs4  = k_KQ %  QI4_1;
        const int shift = k_KQ & (QI8_1/2);
        // keep the low 4 bits of every byte after the shift -> four 4-bit quants in one int
        const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
        const int u = Q_q8[k_KQ_0/WARP_SIZE];
        // 4-way byte-wise integer dot product of the K quants with the Q quants
        const int sumi = __dp4a(v, u, 0);
 #if FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2  * Q_ds = (const half2  *) Q_ds_v;
            // element-wise product of the block's dm pair with the Q pair
            const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
            // low half is scaled by the integer dot product, high half by 1/QI8_1
            const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
            sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
        } else
 #endif // FP16_AVAILABLE
        {
            // same two terms as the half2 path above, computed in float
            const float2 * Q_ds = (const float2 *) Q_ds_v;
            const float sumid4d8   =  __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
            const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
            sum += (T) (sumid4d8 + m4s8scaled);
        }
    }
    return sum;
 #else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
 #endif  // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 // Per-thread partial dot product between one K row stored as q5_0 blocks and a Q vector that
 // has already been quantized to packed 8-bit ints (Q_q8) plus one pair per loop iteration (Q_ds_v).
 // T is the accumulator/return type and selects whether Q_ds_v is read as half2 or float2.
 // Q_v is unused; it is present so the signature matches the vec_dot_KQ_*_t function pointer types.
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 // When __CUDA_ARCH__ is below MIN_CC_DP4A the function body is NO_DEVICE_CODE.
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
    const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
    GGML_UNUSED(Q_v);
    T sum = 0.0f;
 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
        // packed-int index handled by this thread in this iteration
        const int k_KQ = k_KQ_0 + threadIdx.x;
        const int ib    = k_KQ /  QI8_1;
        const int iqs4  = k_KQ %  QI5_0;
        const int iqs8  = k_KQ %  QI8_1;
        const int shift = k_KQ & (QI8_1/2);
        // low 4 bits of each of the four quants, one per byte
        int v = (get_int_from_uint8(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
        // the block's qh bits, shifted so that the bits for these four quants start at bit 0
        const int vh = get_int_from_uint8(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
        // place one qh bit at bit 4 of each byte, forming the 5-bit quants
        v |= (vh <<  4) & 0x00000010; // 0 ->  4
        v |= (vh << 11) & 0x00001000; // 1 -> 12
        v |= (vh << 18) & 0x00100000; // 2 -> 20
        v |= (vh << 25) & 0x10000000; // 3 -> 28
        const int u = Q_q8[k_KQ_0/WARP_SIZE];
        // 4-way byte-wise integer dot product of the K quants with the Q quants
        const int sumi = __dp4a(v, u, 0);
 #if FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2  * Q_ds = (const half2  *) Q_ds_v;
            // the block scale d multiplied into both halves of the Q pair
            const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
        } else
 #endif // FP16_AVAILABLE
        {
            // same expression as the half path above, computed in float
            const float2 * Q_ds = (const float2 *) Q_ds_v;
            sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (16/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
        }
    }
    return sum;
 #else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
 #endif  // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 // Per-thread partial dot product between one K row stored as q5_1 blocks and a Q vector that
 // has already been quantized to packed 8-bit ints (Q_q8) plus one pair per loop iteration (Q_ds_v).
 // T is the accumulator/return type and selects whether Q_ds_v is read as half2 or float2.
 // Q_v is unused; it is present so the signature matches the vec_dot_KQ_*_t function pointer types.
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 // When __CUDA_ARCH__ is below MIN_CC_DP4A the function body is NO_DEVICE_CODE.
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
    const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
    GGML_UNUSED(Q_v);
    T sum = 0.0f;
 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
        // packed-int index handled by this thread in this iteration
        const int k_KQ = k_KQ_0 + threadIdx.x;
        const int ib    = k_KQ /  QI8_1;
        const int iqs4  = k_KQ %  QI5_1;
        const int iqs8  = k_KQ %  QI8_1;
        const int shift = k_KQ & (QI8_1/2);
        // low 4 bits of each of the four quants, one per byte
        int v = (get_int_from_uint8(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
        // the block's qh bits, shifted so that the bits for these four quants start at bit 0
        const int vh = get_int_from_uint8(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
        // place one qh bit at bit 4 of each byte, forming the 5-bit quants
        v |= (vh <<  4) & 0x00000010; // 0 ->  4
        v |= (vh << 11) & 0x00001000; // 1 -> 12
        v |= (vh << 18) & 0x00100000; // 2 -> 20
        v |= (vh << 25) & 0x10000000; // 3 -> 28
        const int u = Q_q8[k_KQ_0/WARP_SIZE];
        // 4-way byte-wise integer dot product of the K quants with the Q quants
        const int sumi = __dp4a(v, u, 0);
 #if FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2  * Q_ds = (const half2  *) Q_ds_v;
            // element-wise product of the block's dm pair with the Q pair
            const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
            // low half is scaled by the integer dot product, high half by 1/QI8_1
            const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
            sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
        } else
 #endif // FP16_AVAILABLE
        {
            // same two terms as the half2 path above, computed in float
            const float2 * Q_ds = (const float2 *) Q_ds_v;
            const float sumid5d8   =  __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
            const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
            sum += (T) (sumid5d8 + m5s8scaled);
        }
    }
    return sum;
 #else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
 #endif  // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 // Per-thread partial dot product between one K row stored as q8_0 blocks and a Q vector that
 // has already been quantized to packed 8-bit ints (Q_q8) plus one pair per loop iteration (Q_ds_v).
 // T is the accumulator/return type and selects whether Q_ds_v is read as half2 or float2;
 // only the first component of each pair is used here.
 // Q_v is unused; it is present so the signature matches the vec_dot_KQ_*_t function pointer types.
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 // When __CUDA_ARCH__ is below MIN_CC_DP4A the function body is NO_DEVICE_CODE.
 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
    GGML_UNUSED(Q_v);
    const block_q8_0 * K_blocks = (const block_q8_0 *) K_c;
    T acc = 0.0f;
 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
        const int k_KQ = k_KQ_0 + threadIdx.x; // packed-int index handled by this thread
        const int islot = k_KQ_0/WARP_SIZE;    // index into this thread's Q_q8 / Q_ds data
        const int ib  = k_KQ / QI8_0;
        const int iqs = k_KQ % QI8_0;
        const int K_int = get_int_from_int8(K_blocks[ib].qs, iqs);
        // fetch the first component of the Q pair in the precision selected by T
        T Q_scale;
        if (std::is_same<T, half>::value) {
            Q_scale = __low2half(((const half2  *) Q_ds_v)[islot]);
        } else {
            Q_scale = ((const float2 *) Q_ds_v)[islot].x;
        }
        acc += vec_dot_q8_0_q8_1_impl<T, 1>(&K_int, &Q_q8[islot], K_blocks[ib].d, Q_scale);
    }
    return acc;
 #else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
 #endif  // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 // Per-thread partial dot product between one K row stored as f16 (read two values at a time
 // as half2) and an unquantized Q vector passed through Q_v.
 // With FP16_AVAILABLE and T == half, Q_v is read as half2 and the sum is formed in half2;
 // otherwise Q_v is read as float2 and the sum is formed in float.
 // Q_q8 and Q_ds_v are unused; they are present so the signature matches the vec_dot_KQ_*_t types.
 // Only this thread's contribution is returned, no cross-thread reduction happens here.
 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    const half2 * K_pairs = (const half2 *) K_c;
 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        const half2 * Q_pairs_h = (const half2 *) Q_v;
        half2 acc2 = make_half2(0.0f, 0.0f);
 #pragma unroll
        for (int k0 = 0; k0 < D/2; k0 += WARP_SIZE) {
            const int k = k0 + threadIdx.x; // half2 index of K handled by this thread
            acc2 += K_pairs[k] * Q_pairs_h[k0/WARP_SIZE];
        }
        // fold the two lanes of the half2 accumulator into one value
        return __low2half(acc2) + __high2half(acc2);
    }
 #endif // FP16_AVAILABLE
    const float2 * Q_pairs_f = (const float2 *) Q_v;
    float acc = 0.0f;
 #pragma unroll
    for (int k0 = 0; k0 < D/2; k0 += WARP_SIZE) {
        const int k = k0 + threadIdx.x; // half2 index of K handled by this thread
        const half2 K_pair = K_pairs[k];
        acc +=  __low2float(K_pair) * Q_pairs_f[k0/WARP_SIZE].x;
        acc += __high2float(K_pair) * Q_pairs_f[k0/WARP_SIZE].y;
    }
    return acc;
 }
 // Quantize four consecutive floats of x (scaled by `scale`) per thread into one packed int of
 // int8 values, and store a (d, sum) pair for every group of QI8_1 threads.
 //   Tds  - half2 or float2; selects the type used when writing to yds.
 //   x    - input floats; thread t reads x[4*t .. 4*t+3].
 //   yq32 - output, one packed int per thread, indexed by threadIdx.x.
 //   yds  - output, one (d, sum) pair per QI8_1 threads, written by the first thread of each group.
 // NOTE(review): the name suggests yq32/yds live in shared memory, but nothing in this function
 // depends on that - confirm against the callers.
 template <typename Tds>
 static __device__ __forceinline__ void quantize_q8_1_to_shared(
    const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
    float vals[sizeof(int)] = {0.0f};
    // load and scale this thread's four values
 #pragma unroll
    for (int l = 0; l < sizeof(int); ++l) {
        vals[l] = scale * x[4*threadIdx.x + l];
    }
    // per-thread absolute maximum and sum of the four values
    float amax = fabsf(vals[0]);
    float sum  = vals[0];
 #pragma unroll
    for (int l = 1; l < sizeof(int); ++l) {
        amax = fmaxf(amax, fabsf(vals[l]));
        sum += vals[l];
    }
    // combine amax and sum with the lanes at xor-distance QI8_1/2, QI8_1/4, ..., 1
    // (presumably QI8_1 is a power of two so this covers each group of QI8_1 lanes - TODO confirm)
 #pragma unroll
    for (int mask = QI8_1/2; mask > 0; mask >>= 1) {
        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, 32));
        sum +=             __shfl_xor_sync(0xFFFFFFFF, sum,  mask, 32);
    }
    // scale that maps the largest magnitude onto 127
    const float d = amax / 127;
    int q32 = 0;
    // byte-wise view of q32 so the four int8 quants can be written individually
    int8_t * q8 = (int8_t *) &q32;
    // when d is zero the quants are left at 0, which also avoids dividing by zero
    if (d != 0.0f) {
 #pragma unroll
        for (int l = 0; l < sizeof(int); ++l) {
            q8[l] = roundf(vals[l] / d);
        }
    }
    yq32[threadIdx.x] = q32;
    // only the first thread of each group of QI8_1 threads stores the (d, sum) pair
    if (threadIdx.x % QI8_1 == 0) {
        if (std::is_same<Tds, half2>::value) {
            ((half2  *) yds)[threadIdx.x/QI8_1] =  make_half2(d, sum);
        } else {
            ((float2 *) yds)[threadIdx.x/QI8_1] = make_float2(d, sum);
        }
    }
 }
 // Function pointer types for device functions that dequantize a single element:
 // the arguments are a pointer to the (possibly quantized) data and the flat element index,
 // the result is the element's value as half or float respectively.
 typedef half  (*dequantize_1_f16_t)(const void *, const int64_t);
 typedef float (*dequantize_1_f32_t)(const void *, const int64_t);
 // Returns element i of q4_0 data as T (half or float).
 // Each byte of qs packs two 4 bit quants: the low nibble is used for the first half of a
 // block, the high nibble for the second half. The value is d*(quant - 8).
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) {
    const block_q4_0 * blocks = (const block_q4_0 *) vx;

    const int64_t iblock = i / QK4_0;
    const int     ibyte  = i % (QK4_0/2);
    const int     nibble = (i % QK4_0) / (QK4_0/2); // 0 == low nibble, 1 == high nibble

    const block_q4_0 & blk = blocks[iblock];

    const T   scale  = blk.d;
    const int packed = blk.qs[ibyte];
    const int quant  = ((packed >> (4*nibble)) & 0x0F) - 8;

 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        return ((half) scale)*((half) quant);
    }
 #endif // FP16_AVAILABLE

    return ((float) scale)*((float) quant);
 }
 // Returns element i of q4_1 data as T (half or float).
 // Same nibble layout as q4_0 but without the -8 offset; the value is
 // low(dm)*quant + high(dm).
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) {
    const block_q4_1 * blocks = (const block_q4_1 *) vx;

    const int64_t iblock = i / QK4_1;
    const int     ibyte  = i % (QK4_1/2);
    const int     nibble = (i % QK4_1) / (QK4_1/2); // 0 == low nibble, 1 == high nibble

    const block_q4_1 & blk = blocks[iblock];

    const half2 scale_min = blk.dm;
    const int   packed    = blk.qs[ibyte];
    const int   quant     = ((packed >> (4*nibble)) & 0x0F);

 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        return __low2half(scale_min)*((half) quant) + __high2half(scale_min);
    }
 #endif // FP16_AVAILABLE

    return __low2float(scale_min)*((float) quant) + __high2float(scale_min);
 }
 // Returns element i of q5_0 data as T (half or float).
 // The lower 4 bits of a quant come from a nibble of qs, the 5th bit is bit number
 // (i % QK5_0) of the 32 bit value read from qh. The value is d*(quant - 16).
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) {
    const block_q5_0 * blocks = (const block_q5_0 *) vx;

    const int64_t iblock = i / QK5_0;
    const int     ielem  = i % QK5_0;               // position inside the block, selects the bit in qh
    const int     ibyte  = i % (QK5_0/2);
    const int     nibble = (i % QK5_0) / (QK5_0/2); // 0 == low nibble, 1 == high nibble

    const block_q5_0 & blk = blocks[iblock];

    const T   scale       = blk.d;
    const int packed_low  = blk.qs[ibyte];
    const int packed_high = get_int_from_uint8(blk.qh, 0);

    const int low4  = ((packed_low >> (4*nibble)) & 0x0F);
    const int bit5  = ((packed_high >> ielem) << 4) & 0x10;
    const int quant = (low4 | bit5) - 16;

 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        return ((half) scale)*((half) quant);
    }
 #endif // FP16_AVAILABLE

    return ((float) scale)*((float) quant);
 }
 // Returns element i of q5_1 data as T (half or float).
 // The lower 4 bits of a quant come from a nibble of qs, the 5th bit is bit number
 // (i % QK5_1) of the 32 bit value read from qh. The value is low(dm)*quant + high(dm).
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) {
    const block_q5_1 * blocks = (const block_q5_1 *) vx;

    const int64_t iblock = i / QK5_1;
    const int     ielem  = i % QK5_1;               // position inside the block, selects the bit in qh
    const int     ibyte  = i % (QK5_1/2);
    const int     nibble = (i % QK5_1) / (QK5_1/2); // 0 == low nibble, 1 == high nibble

    const block_q5_1 & blk = blocks[iblock];

    const half2 scale_min   = blk.dm;
    const int   packed_low  = blk.qs[ibyte];
    const int   packed_high = get_int_from_uint8_aligned(blk.qh, 0);

    const int low4  = ((packed_low >> (4*nibble)) & 0x0F);
    const int bit5  = ((packed_high >> ielem) << 4) & 0x10;
    const int quant = (low4 | bit5);

 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        return __low2half(scale_min)*((half) quant) + __high2half(scale_min);
    }
 #endif // FP16_AVAILABLE

    return __low2float(scale_min)*((float) quant) + __high2float(scale_min);
 }
 // Returns element i of q8_0 data as T (half or float): the block scale d times the
 // quant stored at position (i % QK8_0) of block (i / QK8_0).
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) {
    const block_q8_0 * blocks = (const block_q8_0 *) vx;

    const int64_t iblock = i / QK8_0;
    const int     ielem  = i % QK8_0;

    const block_q8_0 & blk = blocks[iblock];

    const T   scale = blk.d;
    const int quant = blk.qs[ielem];

 #if FP16_AVAILABLE
    if (std::is_same<T, half>::value) {
        return ((half) scale)*((half) quant);
    }
 #endif // FP16_AVAILABLE

    return ((float) scale)*((float) quant);
 }
 // f16 data is not quantized: element i is read as half and converted to T on return.
 template <typename T>
 static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) {
    return ((const half *) vx)[i];
 }
 // Selects the vec_dot_fattn_vec_KQ_* instantiation (T == half, head size D) that matches the
 // type of K. Returns nullptr for any type other than q4_0, q4_1, q5_0, q5_1, q8_0 and f16.
 // Written as a single return expression so that it can be evaluated at compile time.
 template <int D>
 constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> :
        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> :
        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> :
        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> :
        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> :
        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> :
        nullptr; // unsupported K type
 }
 // Selects the vec_dot_fattn_vec_KQ_* instantiation (T == float, head size D) that matches the
 // type of K. Returns nullptr for any type other than q4_0, q4_1, q5_0, q5_1, q8_0 and f16.
 // Written as a single return expression so that it can be evaluated at compile time.
 template <int D>
 constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> :
        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> :
        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> :
        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> :
        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> :
        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> :
        nullptr; // unsupported K type
 }
 // Selects the dequantize_1_* instantiation returning half that matches the type of V.
 // Returns nullptr for any type other than q4_0, q4_1, q5_0, q5_1, q8_0 and f16.
 // Written as a single return expression so that it can be evaluated at compile time.
 constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) {
    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<half> :
        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<half> :
        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<half> :
        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<half> :
        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<half> :
        type_V == GGML_TYPE_F16 ? dequantize_1_f16<half> :
        nullptr; // unsupported V type
 }
 // Selects the dequantize_1_* instantiation returning float that matches the type of V.
 // Returns nullptr for any type other than q4_0, q4_1, q5_0, q5_1, q8_0 and f16.
 // Written as a single return expression so that it can be evaluated at compile time.
 constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<float> :
        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<float> :
        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<float> :
        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<float> :
        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<float> :
        type_V == GGML_TYPE_F16 ? dequantize_1_f16<float> :
        nullptr; // unsupported V type
 }
 template<int D, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
@ -83,8 +599,32 @@ static __global__ void flash_attn_combine_results(
    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 // Error handler for the case that no vector FlashAttention kernel exists for the requested
 // combination of K and V types. Prints an explanation for head size D to stderr and then
 // aborts via GGML_ASSERT(false); this function is not expected to return.
 static void on_no_fattn_vec_case(const int D) {
    if (D == 64) {
        fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
        fprintf(stderr, "By default only f16 KV cache is supported.\n");
        fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
    } else if (D == 128) {
        fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
        fprintf(stderr, "Supported combinations:\n");
        fprintf(stderr, "  - K == q4_0, V == q4_0,  4.50 BPV\n");
        fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
        fprintf(stderr, "  - K == f16,  V == f16,  16.00 BPV\n");
        fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
    } else {
        // Report the head size that was actually passed instead of a hard-coded 256 so that
        // the message is not misleading if this is ever reached with another value of D.
        // For D == 256 the output is unchanged.
        fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D);
        fprintf(stderr, "Only f16 is supported.\n");
    }
    // Every branch is fatal:
    GGML_ASSERT(false);
 }
 template <int D, int parallel_blocks>
-void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) {
+void launch_fattn(
    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
    const int nwarps, const int cols_per_block, const bool need_f16_K, const bool need_f16_V
 ) {
    const ggml_tensor * Q = dst->src[0];
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];
@ -94,8 +634,6 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern
    ggml_tensor * KQV = dst;
    GGML_ASSERT(Q->type == GGML_TYPE_F32);
    GGML_ASSERT(K->type == GGML_TYPE_F16);
    GGML_ASSERT(V->type == GGML_TYPE_F16);
    GGML_ASSERT(KQV->type == GGML_TYPE_F32);
    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
@ -107,9 +645,49 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
    ggml_cuda_pool_alloc<half>   K_f16(pool);
    ggml_cuda_pool_alloc<half>   V_f16(pool);
    ggml_cuda_pool_alloc<float>  dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
    char * K_data = (char *) K->data;
    size_t nb11 = K->nb[1];
    size_t nb12 = K->nb[2];
    size_t nb13 = K->nb[3];
    char * V_data = (char *) V->data;
    size_t nb21 = V->nb[1];
    size_t nb22 = V->nb[2];
    size_t nb23 = V->nb[3];
    if (need_f16_K && K->type != GGML_TYPE_F16) {
        K_f16.alloc(ggml_nelements(K));
        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
        to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
        K_data = (char *) K_f16.ptr;
        const size_t bs = ggml_blck_size(K->type);
        const size_t ts = ggml_type_size(K->type);
        nb11 = nb11*bs*sizeof(half)/ts;
        nb12 = nb12*bs*sizeof(half)/ts;
        nb13 = nb13*bs*sizeof(half)/ts;
    }
    if (need_f16_V && V->type != GGML_TYPE_F16) {
        V_f16.alloc(ggml_nelements(V));
        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
        to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
        V_data = (char *) V_f16.ptr;
        const size_t bs = ggml_blck_size(V->type);
        const size_t ts = ggml_type_size(V->type);
        nb21 = nb21*bs*sizeof(half)/ts;
        nb22 = nb22*bs*sizeof(half)/ts;
        nb23 = nb23*bs*sizeof(half)/ts;
    }
    if (parallel_blocks > 1) {
        dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
        dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
@ -133,8 +711,8 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern
    fattn_kernel<<<blocks_num, block_dim, shmem, main_stream>>>(
        (const char *) Q->data,
-        (const char *) K->data,
+        K_data,
-        (const char *) V->data,
+        V_data,
        mask ? ((const char *) mask->data) : nullptr,
        (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2,
@ -142,7 +720,8 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
        mask ? mask->ne[1] : 0, mask ?  mask->nb[1] : 0,
        Q->nb[1], Q->nb[2], Q->nb[3],
-        K->nb[1], K->nb[2], K->nb[3],
+        nb11, nb12, nb13,
        nb21, nb22, nb23,
        KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
    );
    CUDA_CHECK(cudaGetLastError());
--- a/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml-cuda/fattn-tile-f16.cu
@ -36,6 +36,9 @@ static __global__ void flash_attn_tile_ext_f16(
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
@ -275,13 +278,13 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int      D = 64;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
--- a/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml-cuda/fattn-tile-f32.cu
@ -36,6 +36,9 @@ static __global__ void flash_attn_tile_ext_f32(
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
@ -272,13 +275,13 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int      D = 64;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
--- a/ggml-cuda/fattn-vec-f16.cu
+++ b/ggml-cuda/fattn-vec-f16.cu
@ -1,330 +0,0 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 #include "fattn-vec-f16.cuh"
 // FlashAttention kernel that processes ncols Q columns per thread block and accumulates in half precision.
 // K and V are read as f16 data, Q as f32 data. The code assumes D threads per block (tid < D),
 // arranged as D/WARP_SIZE warps with threadIdx.y as the warp index.
 // With parallel_blocks == 1 the normalized result is written to dst. Otherwise parallel_blocks thread blocks
 // share the work for the same Q columns: each one writes an unnormalized partial result to dst and
 // its (KQ maximum, KQ sum) pair to dst_meta.
 // If FP16 is not available the kernel body is replaced with NO_DEVICE_CODE.
 template<int D, int ncols, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
 #if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    // Typed views of the inputs, offset to the matrices that this thread block works on:
    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;
    // Row stride of K/V (nb11 is in bytes) in units of half and half2:
    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);
    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);
    // Shared buffer for one tile of KQ values per Q column, initialized to a very negative value:
    __shared__ half KQ[ncols*D];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
    half2 * KQ2 = (half2 *) KQ;
    // Running maximum and running sum of exponentials per Q column:
    half kqmax[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
    }
    half kqsum[ncols] = {0.0f};
    // Shared buffers used to exchange per-warp maxima and sums between warps:
    __shared__ half kqmax_shared[ncols][WARP_SIZE];
    __shared__ half kqsum_shared[ncols][WARP_SIZE];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();
    // Convert Q to half2 and store in registers:
    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
 #pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            // For ncols > 2 columns at or beyond ne01 are filled with zeros instead of being read; scale is applied here.
            const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
        }
    }
    // Per-thread accumulator for V*softmax(KQ), one half2 per Q column:
    half2 VKQ[ncols] = {{0.0f, 0.0f}};
    // Iterate over K/V in tiles of D rows; with parallel_blocks > 1 the tiles are interleaved between blocks:
    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
        half kqmax_new = kqmax[0];
        half kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }
        // Each warp calculates the KQ values for the K rows i_KQ of the tile:
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;
            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }
            half2 sum2[ncols] = {{0.0f, 0.0f}};
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;
                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
 #pragma unroll
                for (int j = 0; j < ncols; ++j) {
                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
                }
            }
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                sum2[j] = warp_reduce_sum(sum2[j]);
                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
                // Add the mask value scaled with the ALiBi slope (if a mask was passed):
                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
                if (ncols == 1) {
                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
                } else {
                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
                }
                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum;
                }
            }
        }
        // Exchange the per-warp maxima via shared memory:
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }
        __syncthreads();
        // Update the running maximum, rescale the previous sum and accumulator to the new maximum,
        // and replace the KQ values in shared memory with their exponentials:
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;
            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;
            VKQ[j] *= __half2half2(KQ_max_scale);
        }
        __syncthreads();
        // Accumulate V*KQ; two V rows are processed per iteration, thread tid handles element tid of each row:
 #pragma unroll
        for (int k0 = 0; k0 < D; k0 += 2) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
                break;
            }
            half2 V_k;
            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
            }
        }
        // KQ is overwritten in the next iteration, wait until all threads are done reading it:
        __syncthreads();
    }
    // Sum up the per-thread KQ sums, first within each warp, then across warps via shared memory:
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }
    __syncthreads();
    // Write back the results:
 #pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
            break;
        }
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
        // Only normalize here if there are no partial results from other blocks:
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }
    // With parallel blocks, also store the KQ maximum and KQ sum of this block's partial result (one thread per column):
    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
   NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
 }
 // Launches flash_attn_vec_ext_f16 with 1 column per block and 4 parallel blocks.
 // The head size is taken from Q->ne[0] and must be 64, 128, or 256;
 // the operation must have been created with default precision.
 void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int cols_per_block  = 1;
    constexpr int parallel_blocks = 4;

    const int32_t precision = dst->op_params[2];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);

    const auto head_size = dst->src[0]->ne[0];

    // The head size is a template parameter, therefore each supported value needs its own instantiation.
    if (head_size == 64) {
        constexpr int D = 64;
        fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, D/WARP_SIZE, cols_per_block);
    } else if (head_size == 128) {
        constexpr int D = 128;
        fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, D/WARP_SIZE, cols_per_block);
    } else if (head_size == 256) {
        constexpr int D = 256;
        fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, D/WARP_SIZE, cols_per_block);
    } else {
        GGML_ASSERT(false);
    }
 }
 // Launches flash_attn_vec_ext_f16 for the given number of columns per block and parallel blocks.
 // Only the head sizes 64 and 128 (taken from Q->ne[0]) are supported, anything else is a fatal error.
 template <int cols_per_block, int parallel_blocks>
 void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const auto head_size = dst->src[0]->ne[0];

    if (head_size == 64) {
        constexpr int D = 64;
        fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, D/WARP_SIZE, cols_per_block);
    } else if (head_size == 128) {
        constexpr int D = 128;
        fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, D/WARP_SIZE, cols_per_block);
    } else {
        GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
    }
 }
 // Chooses the launch configuration of the f16 vector kernel based on Q->ne[1]:
 //   == 1: dedicated single-column path (ggml_cuda_flash_attn_ext_vec_f16)
 //   == 2: 2 columns per block, 4 parallel blocks
 //   <= 4: 4 columns per block, 4 parallel blocks
 //   <= 8: 8 columns per block, 4 parallel blocks
 //   else: 8 columns per block, 1 parallel block
 // The operation must have been created with default precision.
 void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int32_t precision = dst->op_params[2];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);

    const auto n_q_cols = dst->src[0]->ne[1];

    if (n_q_cols == 1) {
        ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
    } else if (n_q_cols == 2) {
        launch_fattn_vec_f16_64_128<2, 4>(ctx, dst);
    } else if (n_q_cols <= 4) {
        launch_fattn_vec_f16_64_128<4, 4>(ctx, dst);
    } else if (n_q_cols <= 8) {
        launch_fattn_vec_f16_64_128<8, 4>(ctx, dst);
    } else {
        launch_fattn_vec_f16_64_128<8, 1>(ctx, dst);
    }
 }
--- a/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml-cuda/fattn-vec-f16.cuh
@ -1,5 +1,397 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
-void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
 #if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
-void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+    constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    Q += nb02* blockIdx.y              + nb01*ic0;
    K += nb12*(blockIdx.y / gqa_ratio);
    V += nb22*(blockIdx.y / gqa_ratio);
    const half * maskh = (const half   *)  mask + ne11*ic0;
    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);
    __shared__ half KQ[ncols*D];
    half2 * KQ2 = (half2 *) KQ;
    half kqmax[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
    }
    half kqsum[ncols] = {0.0f};
    __shared__ half kqmax_shared[ncols][WARP_SIZE];
    __shared__ half kqsum_shared[ncols][WARP_SIZE];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();
    // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
    half2  Q_h2[ncols][D/(2*WARP_SIZE)];
    int   Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
    half2  Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
    if (Q_q8_1) {
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            if (j0 + nwarps > ncols && j >= ncols) {
                break;
            }
            // Reuse KQ as temporary storage for converting Q to q8_1:
            int   * tmp_q_i32 = (int   *) &KQ[j*D];
            half2 * tmp_q_ds  = (half2 *) (tmp_q_i32 + D/sizeof(int));
            // Set memory to zero if out of bounds:
            if (ncols > 2 && ic0 + j >= ne01) {
 #pragma unroll
                for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;
                    tmp_q_i32[i] = 0;
                }
                if (threadIdx.x < D/QK8_1) {
                    tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f);
                }
                continue;
            }
            const float * Q_f = (const float *) (Q + j*nb01);
 #pragma unroll
            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                quantize_q8_1_to_shared<half2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            int   * tmp_q_i32 = (int   *) &KQ[j*D];
            half2 * tmp_q_ds  = (half2 *) (tmp_q_i32 + D/sizeof(int));
 #pragma unroll
            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
                Q_ds[j][i0/WARP_SIZE]  = tmp_q_ds[i/QI8_1];
            }
        }
        __syncthreads();
    } else {
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
 #pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
                Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
            }
        }
    }
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
    half2 VKQ[ncols] = {{0.0f, 0.0f}};
    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
        half kqmax_new = kqmax[0];
        half kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;
            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
                sum = warp_reduce_sum(sum);
                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
                if (ncols == 1) {
                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
                } else {
                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
                }
                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum;
                }
            }
        }
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;
            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;
            VKQ[j] *= __half2half2(KQ_max_scale);
        }
        __syncthreads();
 #pragma unroll
        for (int k0 = 0; k0 < D; k0 += 2) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
                break;
            }
            half2 V_k;
            reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid);
            reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid);
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
            }
        }
        __syncthreads();
    }
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }
    __syncthreads();
 #pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
            break;
        }
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }
    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
   NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
 }
 // Launches flash_attn_vec_ext_f16 for one fully specified template configuration
 // (head size D, Q columns per block, parallel blocks per column group, K type, V type).
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // The kernel is launched with D/WARP_SIZE warps per block.
    constexpr int  warps_per_block = D/WARP_SIZE;

    // Flags forwarded to launch_fattn: K is flagged unless D == 128, V unless D is 128 or 64.
    // NOTE(review): presumably these request an FP16 copy of K/V before the launch - confirm in launch_fattn.
    constexpr bool k_needs_f16 = !(D == 128);
    constexpr bool v_needs_f16 = !(D == 128 || D == 64);

    fattn_kernel_t kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>;
    launch_fattn<D, parallel_blocks>(ctx, dst, kernel, warps_per_block, cols_per_block, k_needs_f16, v_needs_f16);
 }
 // Entry point for the FP16 vector kernel for a fixed (D, type_K, type_V) combination.
 // Checks that the operation uses default precision and that the K/V tensor types match the
 // template arguments, then picks cols_per_block / parallel_blocks from the number of Q columns.
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];

    const int32_t precision = dst->op_params[2];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
    GGML_ASSERT(K->type == type_K);
    GGML_ASSERT(V->type == type_V);

    // Number of Q columns (ne[1] of the first source tensor) drives the dispatch below.
    const auto n_q_cols = dst->src[0]->ne[1];

    // Template arguments are <D, cols_per_block, parallel_blocks, type_K, type_V>.
    if (n_q_cols == 1) {
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, 1, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols == 2) {
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, 2, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols <= 4) {
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, 4, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols <= 8) {
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, 8, 4, type_K, type_V>(ctx, dst);
    } else {
        // More than 8 columns: keep 8 columns per block but do not split K/V across blocks.
        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, 8, 1, type_K, type_V>(ctx, dst);
    }
 }
 // Expands to an explicit instantiation of ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>.
 // Prefixed with `extern` (as below) it becomes an explicit instantiation declaration.
 // NOTE(review): the matching definitions are presumably emitted in separate translation units - confirm.
 // The last line of the macro must NOT end in a backslash: a trailing continuation would splice
 // the following line (the first extern declaration) into the macro body.
 #define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)                          \
    template void ggml_cuda_flash_attn_ext_vec_f16_case                     \
    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 // Head size 64: F16 K with every listed V type.
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
 // Head size 128: every listed K type combined with every listed V type.
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16);
 // Head size 256: F16 K and V only.
 extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
--- a/ggml-cuda/fattn-vec-f32.cu
+++ b/ggml-cuda/fattn-vec-f32.cu
@ -1,279 +0,0 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 #include "fattn-vec-f32.cuh"
 // FlashAttention "vector" kernel that accumulates in FP32.
 // A block (expected to hold D threads, i.e. D/WARP_SIZE warps) processes ncols consecutive
 // Q columns starting at column ic0 for the Q matrix selected by blockIdx.y.
 // Q is read as FP32; K, V and the optional mask are read as FP16.
 // With parallel_blocks > 1, parallel_blocks blocks work on the same Q columns and each one
 // visits every parallel_blocks-th tile of D K/V rows. In that case dst receives unnormalized
 // values and the per-column (running max, running sum) pair is written to dst_meta.
 // NOTE(review): the final combination of those partial results presumably happens in a
 // separate step outside this kernel - confirm in launch_fattn.
 template<int D, int ncols, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;
    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);
    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);
    // Shared scratch holding one tile of KQ values (D entries per Q column).
    __shared__ float KQ[ncols*D];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -FLT_MAX/2.0f;
    }
    // Per-column running maximum and running sum of exponentials.
    float kqmax[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -FLT_MAX/2.0f;
    }
    float kqsum[ncols] = {0.0f};
    // Shared buffers used to combine the per-warp maxima/sums across the warps of the block.
    __shared__ float kqmax_shared[ncols][WARP_SIZE];
    __shared__ float kqsum_shared[ncols][WARP_SIZE];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();
    // Load Q as float2, apply the scale and store in registers:
    float2 Q_h2[ncols][D/(2*WARP_SIZE)];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
 #pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            // Columns at or beyond ne01 are zero-filled instead of being read from memory.
            // The check is skipped for ncols <= 2 (the dispatcher in this file only selects
            // those values when Q has exactly that many columns).
            Q_h2[j][i0/WARP_SIZE]    = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
            Q_h2[j][i0/WARP_SIZE].x *= scale;
            Q_h2[j][i0/WARP_SIZE].y *= scale;
        }
    }
    float VKQ[ncols] = {0.0f};
    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
        float kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            // Each warp (threadIdx.y) handles one K row of the tile per iteration.
            const int i_KQ = i_KQ_0 + threadIdx.y;
            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }
            float sum[ncols] = {0.0f};
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;
                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
 #pragma unroll
                for (int j = 0; j < ncols; ++j) {
                    sum[j] +=  __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
                    sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
                }
            }
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                sum[j] = warp_reduce_sum(sum[j]);
                // Add the mask value scaled by the ALiBi slope (if a mask was provided).
                sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum[j];
                }
            }
        }
        // Publish the per-warp maxima so that every warp can compute the block-wide maximum.
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_new_arr[j];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            // Rescale what was accumulated so far from the old to the new maximum.
            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;
            const float val = expf(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;
            VKQ[j] *= KQ_max_scale;
        }
        __syncthreads();
        // Accumulate V*softmax(KQ) for this tile; thread tid owns element tid of the result.
 #pragma unroll
        for (int k = 0; k < D; ++k) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
                break;
            }
            const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_ki*KQ[j*D + k];
            }
        }
        __syncthreads();
    }
    // Combine the per-thread sums of exponentials: first within each warp, then across warps.
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }
    __syncthreads();
 #pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
            break;
        }
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
        float dst_val = VKQ[j_VKQ];
        // Only normalize here if this block saw all K/V rows for the column.
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }
    // With several parallel blocks, store (max, sum) per column for the later combination.
    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
    }
 }
 // Selects the flash_attn_vec_ext_f32 instantiation matching the head size of Q (ne[0] of the
 // first source tensor) and launches it. Only head sizes 64 and 128 are handled; any other
 // value triggers the assertion below.
 template <int cols_per_block, int parallel_blocks>
 void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const auto head_size = dst->src[0]->ne[0];

    if (head_size == 64) {
        constexpr int D               = 64;
        constexpr int warps_per_block = D/WARP_SIZE;
        fattn_kernel_t kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, warps_per_block, cols_per_block);
    } else if (head_size == 128) {
        constexpr int D               = 128;
        constexpr int warps_per_block = D/WARP_SIZE;
        fattn_kernel_t kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, warps_per_block, cols_per_block);
    } else {
        GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
    }
 }
 // Entry point for the FP32 vector kernel: picks cols_per_block / parallel_blocks from the
 // number of Q columns (ne[1] of the first source tensor) and forwards to the head-size dispatch.
 void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const auto n_q_cols = dst->src[0]->ne[1];

    // Template arguments are <cols_per_block, parallel_blocks>.
    if (n_q_cols == 1) {
        launch_fattn_vec_f32_64_128<1, 4>(ctx, dst);
    } else if (n_q_cols == 2) {
        launch_fattn_vec_f32_64_128<2, 4>(ctx, dst);
    } else if (n_q_cols <= 4) {
        launch_fattn_vec_f32_64_128<4, 4>(ctx, dst);
    } else if (n_q_cols <= 8) {
        launch_fattn_vec_f32_64_128<8, 4>(ctx, dst);
    } else {
        // More than 8 columns: keep 8 columns per block but do not split K/V across blocks.
        launch_fattn_vec_f32_64_128<8, 1>(ctx, dst);
    }
 }
--- a/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml-cuda/fattn-vec-f32.cuh
@ -1,3 +1,374 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
-void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
 // FlashAttention "vector" kernel that accumulates in FP32 and supports typed K/V data.
 // A block (expected to hold D threads, i.e. D/WARP_SIZE warps) processes ncols consecutive
 // Q columns starting at column ic0 for the Q matrix selected by blockIdx.y.
 // The K.Q dot product and the V dequantization are selected at compile time from type_K and
 // type_V. If type_K is not F16, Q is first converted to q8_1 (integer values plus a float2 per
 // QK8_1 values); otherwise Q is kept as scaled float2 values.
 // With parallel_blocks > 1, parallel_blocks blocks work on the same Q columns and each one
 // visits every parallel_blocks-th tile of D K/V rows. In that case dst receives unnormalized
 // values and the per-column (running max, running sum) pair is written to dst_meta.
 // NOTE(review): the final combination of those partial results presumably happens in a
 // separate step outside this kernel - confirm in launch_fattn.
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    // Type-specific helpers resolved at compile time:
    constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);
    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    Q += nb02* blockIdx.y              + nb01*ic0;
    K += nb12*(blockIdx.y / gqa_ratio);
    V += nb22*(blockIdx.y / gqa_ratio); // K and V have same shape
    const half * maskh = (const half   *)  mask + ne11*ic0;
    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);
    // Shared scratch holding one tile of KQ values (D entries per Q column).
    __shared__ float KQ[ncols*D];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -FLT_MAX/2.0f;
    }
    // Per-column running maximum and running sum of exponentials.
    float kqmax[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -FLT_MAX/2.0f;
    }
    float kqsum[ncols] = {0.0f};
    // Shared buffers used to combine the per-warp maxima/sums across the warps of the block.
    __shared__ float kqmax_shared[ncols][WARP_SIZE];
    __shared__ float kqsum_shared[ncols][WARP_SIZE];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();
    // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
    float2  Q_f2[ncols][D/(2*WARP_SIZE)];
    // Q_i32 is indexed with i0/WARP_SIZE for i0 < D/sizeof(int), i.e. it needs
    // D/(sizeof(int)*QK8_1) entries per column (at least 1 so that the array is never empty).
    int    Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
    float2  Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
    if (Q_q8_1) {
        // Each warp (threadIdx.y) quantizes one Q column per iteration.
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            if (j0 + nwarps > ncols && j >= ncols) {
                break;
            }
            // Reuse KQ as temporary storage for converting Q to q8_1:
            int    * tmp_q_i32 = (int    *) &KQ[j*D];
            float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));
            // Set memory to zero if out of bounds:
            if (ncols > 2 && ic0 + j >= ne01) {
 #pragma unroll
                for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;
                    tmp_q_i32[i] = 0;
                }
                if (threadIdx.x < D/QK8_1) {
                    tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
                }
                continue;
            }
            const float * Q_f = (const float *) (Q + j*nb01);
 #pragma unroll
            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
            }
        }
        __syncthreads();
        // Copy the quantized Q values from the shared scratch into registers.
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            int    * tmp_q_i32 = (int    *) &KQ[j*D];
            float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));
 #pragma unroll
            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
                Q_ds[j][i0/WARP_SIZE]  = tmp_q_ds[i/QI8_1];
            }
        }
        // KQ is reused as the KQ tile below, so all reads of the temporary data must finish first.
        __syncthreads();
    } else {
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
 #pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                // Columns at or beyond ne01 are zero-filled instead of being read from memory.
                // The check is skipped for ncols <= 2 (the dispatcher in this file only selects
                // those values when Q has exactly that many columns).
                Q_f2[j][i0/WARP_SIZE]    = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
                Q_f2[j][i0/WARP_SIZE].x *= scale;
                Q_f2[j][i0/WARP_SIZE].y *= scale;
            }
        }
    }
    float VKQ[ncols] = {0.0f};
    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
        float kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            // Each warp (threadIdx.y) handles one K row of the tile per iteration.
            const int i_KQ = i_KQ_0 + threadIdx.y;
            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
                sum = warp_reduce_sum(sum);
                // Add the mask value scaled by the ALiBi slope (if a mask was provided).
                sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum;
                }
            }
        }
        // Publish the per-warp maxima so that every warp can compute the block-wide maximum.
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_new_arr[j];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            // Rescale what was accumulated so far from the old to the new maximum.
            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;
            const float val = expf(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;
            VKQ[j] *= KQ_max_scale;
        }
        __syncthreads();
        // Accumulate V*softmax(KQ) for this tile; thread tid owns element tid of the result.
 #pragma unroll
        for (int k = 0; k < D; ++k) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
                break;
            }
            const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid);
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_ki*KQ[j*D + k];
            }
        }
        __syncthreads();
    }
    // Combine the per-thread sums of exponentials: first within each warp, then across warps.
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }
    __syncthreads();
 #pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
            break;
        }
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
        float dst_val = VKQ[j_VKQ];
        // Only normalize here if this block saw all K/V rows for the column.
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }
    // With several parallel blocks, store (max, sum) per column for the later combination.
    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
    }
 }
 // Launches flash_attn_vec_ext_f32 for one fully specified template configuration
 // (head size D, Q columns per block, parallel blocks per column group, K type, V type).
 template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // The kernel is launched with D/WARP_SIZE warps per block.
    constexpr int  warps_per_block = D/WARP_SIZE;

    // Flags forwarded to launch_fattn: K is flagged unless D == 128, V unless D is 128 or 64.
    // NOTE(review): presumably these request an FP16 copy of K/V before the launch - confirm in launch_fattn.
    constexpr bool k_needs_f16 = !(D == 128);
    constexpr bool v_needs_f16 = !(D == 128 || D == 64);

    fattn_kernel_t kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>;
    launch_fattn<D, parallel_blocks>(ctx, dst, kernel, warps_per_block, cols_per_block, k_needs_f16, v_needs_f16);
 }
 // Entry point for the FP32 vector kernel for a fixed (D, type_K, type_V) combination.
 // Checks that the K/V tensor types match the template arguments, then picks
 // cols_per_block / parallel_blocks from the number of Q columns.
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];
    GGML_ASSERT(K->type == type_K);
    GGML_ASSERT(V->type == type_V);

    // Number of Q columns (ne[1] of the first source tensor) drives the dispatch below.
    const auto n_q_cols = dst->src[0]->ne[1];

    // Template arguments are <D, cols_per_block, parallel_blocks, type_K, type_V>.
    if (n_q_cols == 1) {
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, 1, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols == 2) {
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, 2, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols <= 4) {
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, 4, 4, type_K, type_V>(ctx, dst);
    } else if (n_q_cols <= 8) {
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, 8, 4, type_K, type_V>(ctx, dst);
    } else {
        // More than 8 columns: keep 8 columns per block but do not split K/V across blocks.
        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, 8, 1, type_K, type_V>(ctx, dst);
    }
 }
 // Expands to an explicit instantiation of ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>.
 // Prefixed with `extern` (as below) it becomes an explicit instantiation declaration.
 // NOTE(review): the matching definitions are presumably emitted in separate translation units - confirm.
 // The last line of the macro must NOT end in a backslash: a trailing continuation would splice
 // the following line (the first extern declaration) into the macro body.
 #define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V)                          \
    template void ggml_cuda_flash_attn_ext_vec_f32_case                     \
    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 // Head size 64: F16 K with every listed V type.
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
 // Head size 128: every listed K type combined with every listed V type.
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
 extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16);
 // Head size 256: F16 K and V only.
 extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
--- a/ggml-cuda/fattn-wmma-f16.cuh
+++ b/ggml-cuda/fattn-wmma-f16.cuh
@ -0,0 +1,490 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 #if FP16_MMA_AVAILABLE
 #include <mma.h>
 #endif
 // Flash attention kernel built on tensor core (WMMA) half precision matrix multiplications.
 // For a tile of ncols Q columns the kernel walks over the K/V rows in steps of FATTN_KQ_STRIDE,
 // computes KQ, applies a running softmax (max and row sum are tracked per column) and accumulates
 // V*softmax(KQ) in shared memory.
 //
 // Template parameters:
 //   D               - head size.
 //   ncols           - number of Q columns handled by one block (8 or a multiple of 16).
 //   nwarps          - number of warps per block, threadIdx.y is used as the warp index.
 //   VKQ_stride      - number of VKQ rows calculated in parallel.
 //   parallel_blocks - number of blocks that work on the same Q columns, each on different K/V rows.
 //   KQ_acc_t        - accumulator type of the KQ product, the code distinguishes float from non-float (half).
 //
 // Q is read as float, K, V and mask are read as half. The result is written to dst as float.
 // With parallel_blocks > 1 the values written to dst are not yet divided by the row sum; instead
 // (KQ max, KQ row sum) is written to dst_meta per column and block, presumably for a later combination step.
 // If FP16_MMA_AVAILABLE is not set the kernel body is NO_DEVICE_CODE.
 //
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
 #if FP16_MMA_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
    const int ip  =        blockIdx.x % parallel_blocks;  // Index in group of blocks running for the same column in parallel.
    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
    // WMMA tile shape: 32x8x16 for ncols == 8, 16x16x16 otherwise.
    constexpr int frag_m = ncols == 8 ? 32 : 16;
    constexpr int frag_n = ncols == 8 ?  8 : 16;
    static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::row_major> frag_a_K;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_a_V;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t>                      frag_c_KQ;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half>                          frag_c_VKQ;
    constexpr int KQ_stride_tc  = nwarps*frag_m; // Number of KQ rows calculated in parallel.
    constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
    static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps.");
    // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts:
    constexpr int D_padded = D + 8;
    constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
    // Size of the KQ accumulator type in units of half (2 for float, 1 for half):
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float * Q_f   = (const float *) (Q + nb02* blockIdx.y              + nb01*ic0);
    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.y / gqa_ratio));
    // NOTE(review): V is addressed with the strides of K (nb12 here, nb11 via stride_KV below),
    // nb21/nb22/nb23 are not used in this kernel.
    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    // Both mask pointers start at mask row ic0, the row stride in halves is nb31/sizeof(half).
    // mask2 views the same data as pairs of halves.
    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);
    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);
    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);
    frag_b Q_b[D/16][ncols/frag_n];
    // A single buffer for temporarily holding tiles of KQ and VKQ parts:
    constexpr int mem_KQ = ncols*kqs_padded*kqar;
    constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded;
    __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts];
    float * KQ_f = (float *) KQ;
    half2 * KQ2 = (half2 *) KQ;
    // Running softmax state per column handled by this warp.
    // The *_f variables are used for KQ_acc_t == float, the *_h2 variables otherwise.
    float    KQ_rowsum_f[ncols/nwarps] = {0.0f};
    float       KQ_max_f[ncols/nwarps];
    float KQ_max_scale_f[ncols/nwarps] = {0.0f};
 #pragma unroll
    for (int j = 0; j < ncols/nwarps; ++j) {
        KQ_max_f[j] = -FLT_MAX/2.0f;
    }
    half2    KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}};
    half2       KQ_max_h2[ncols/nwarps];
    half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}};
 #pragma unroll
    for (int j = 0; j < ncols/nwarps; ++j) {
        KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF);
    }
    __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
    half2 * VKQ2 = (half2 *) VKQ;
    // Zero the VKQ accumulator (the padding columns are not touched):
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;
 #pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D/2 && i >= D/2) {
                break;
            }
            VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f);
        }
    }
    // Convert Q to half and apply scale, temporarily store in KQ:
    // Columns at or beyond ne01 are filled with zeros.
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;
 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D && i >= D) {
                break;
            }
            KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
        }
    }
    __syncthreads();
    // Load Q into tensor core fragments/registers since it will be used frequently:
 #pragma unroll
    for (int i0 = 0; i0 < D; i0 += 16) {
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
            nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
        }
    }
    __syncthreads();
    // Iterate over ne11 == previous tokens:
    // Each of the parallel_blocks blocks starts at its own offset ip*FATTN_KQ_STRIDE.
    for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
        // Calculate tile of KQ:
        // Each warp computes frag_m rows of KQ per iteration and stores them to the shared KQ buffer.
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
            frag_c_KQ KQ_c[ncols/frag_n];
 #pragma unroll
            for (int j = 0; j < ncols/frag_n; ++j) {
                nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
            }
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
                frag_a_K K_a;
                nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
 #pragma unroll
                for (int j = 0; j < ncols/frag_n; ++j) {
                    nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
                }
            }
 #pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
                nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major);
            }
        }
        __syncthreads();
        // Calculate softmax for each KQ column using the current max. value.
        // The divisor is stored in KQ_rowsum and will be applied at the end.
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            if (std::is_same<KQ_acc_t, float>::value) {
                float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
                }
                float KQ_max_new = KQ_max_f[j0/nwarps];
                // Add the slope-scaled mask (if any) and determine the new maximum:
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                    KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
                }
                KQ_max_new = warp_reduce_max(KQ_max_new);
                // Factor by which previously accumulated values need to be scaled, flushed to zero below the threshold:
                const float diff = KQ_max_f[j0/nwarps] - KQ_max_new;
                KQ_max_scale_f[j0/nwarps] = expf(diff);
                if (diff <= SOFTMAX_FTZ_THRESHOLD) {
                    KQ_max_scale_f[j0/nwarps] = 0.0f;
                }
                KQ_max_f[j0/nwarps] = KQ_max_new;
                float KQ_rowsum_add = 0.0f;
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps];
                    KQ_f_tmp[k0/WARP_SIZE] = expf(diff);
                    if (diff <= SOFTMAX_FTZ_THRESHOLD) {
                        KQ_f_tmp[k0/WARP_SIZE] = 0.0f;
                    }
                    KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE];
                    // Written back as half, the column keeps its pitch of kqar*kqs_padded halves:
                    KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE];
                }
                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
                KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add;
            } else {
                half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
                }
                half2 KQ_max_new = KQ_max_h2[j0/nwarps];
                // Add the slope-scaled mask (if any) and determine the new maximum.
                // The mask row is addressed with the row stride nb31/sizeof(half), i.e. the same stride that is
                // used for the base pointers maskh/mask2 and in the float branch above.
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*(nb31/sizeof(half)) + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                    KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
                }
                KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
                const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
                KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
                // Flush to zero: clear the bits of every half whose diff is not above the threshold.
                const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
                *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask;
                KQ_max_h2[j0/nwarps] = KQ_max_new;
                half2 KQ_rowsum_add = make_half2(0.0f, 0.0f);
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps];
                    KQ2_tmp[k0/WARP_SIZE] = h2exp(diff);
                    const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
                    *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask;
                    KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE];
                    KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE];
                }
                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
                KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add;
            }
        }
        __syncthreads();
        // Load the exponentiated KQ values as matrix_b fragments.
        // A warp only loads the 16 wide k slices selected by threadIdx.y % VKQ_ratio.
        frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
 #pragma unroll
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
                nvcuda::wmma::load_matrix_sync(
                    KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
                    KQ + j0*(kqar*kqs_padded) + k,
                    kqar*kqs_padded);
            }
        }
        // Calculate this warp's part of VKQ for the current K/V slice:
        frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n];
 #pragma unroll
        for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) {
 #pragma unroll
            for (int j = 0; j < ncols/frag_n; ++j) {
                nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f);
            }
 #pragma unroll
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
                frag_a_V v_a;
                nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
 #pragma unroll
                for (int j = 0; j < ncols/frag_n; ++j) {
                    nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
                }
            }
        }
        __syncthreads();
        // The KQ buffer is reused to hold the VKQ_ratio partial VKQ results, offset_k selects the part of this warp:
        const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded);
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) {
 #pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
                nvcuda::wmma::store_matrix_sync(
                    KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
                    VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
                    D_padded, nvcuda::wmma::mem_col_major);
            }
        }
        __syncthreads();
        // Rescale the VKQ accumulator to the new maximum and add the sum of the partial results:
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            half2 VKQ_scale;
            if (std::is_same<KQ_acc_t, float>::value) {
                VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]);
            } else {
                VKQ_scale = KQ_max_scale_h2[j0/nwarps];
            }
 #pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                if (i0 + WARP_SIZE > D/2 && i >= D/2) {
                    break;
                }
                half2 VKQ_add = make_half2(0.0f, 0.0f);
 #pragma unroll
                for (int l = 0; l < VKQ_ratio; ++l) {
                    VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i];
                }
                VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add;
            }
        }
        __syncthreads();
    }
    // Write the results for the columns of this warp to dst (and dst_meta if parallel_blocks > 1):
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j_VKQ = j0 + threadIdx.y;
        // j_VKQ only grows with j0, so once a column is out of range all following ones are as well.
        if (ic0 + j_VKQ >= ne01) {
            return;
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        float KQ_rowsum_j;
        if (std::is_same<KQ_acc_t, float>::value) {
            KQ_rowsum_j = KQ_rowsum_f[j0/nwarps];
        } else {
            // The half2 row sum holds two partial sums, add them up:
            KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
        }
 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D && i >= D) {
                break;
            }
            float dst_val = VKQ[j_VKQ*D_padded + i];
            // The softmax divisor is only applied here if this block saw all K/V rows:
            if (parallel_blocks == 1) {
                dst_val /= KQ_rowsum_j;
            }
            dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
        }
        // Only one thread per warp writes the meta data, and only if there are multiple parallel blocks:
        if (parallel_blocks == 1 || threadIdx.x != 0) {
            continue;
        }
        float2 dst_meta_val;
        if (std::is_same<KQ_acc_t, float>::value) {
            dst_meta_val.x = KQ_max_f[j0/nwarps];
        } else {
            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
        }
        dst_meta_val.y = KQ_rowsum_j;
        dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
    }
 #else
   NO_DEVICE_CODE;
 #endif // FP16_MMA_AVAILABLE
 }
 // Returns the largest power of 2 that divides x, e.g. 6 -> 2 and 4 -> 4; odd values yield 1.
 // Kept as a single return statement so that it can be evaluated at compile time.
 // x must not be 0: zero is even and 0/2 == 0, so the recursion would never reach an odd value.
 constexpr int get_max_power_of_2(int x) {
    return x % 2 != 0 ? 1 : 2*get_max_power_of_2(x/2);
 }
 static_assert(get_max_power_of_2(1) == 1, "Test failed.");
 static_assert(get_max_power_of_2(2) == 2, "Test failed.");
 static_assert(get_max_power_of_2(4) == 4, "Test failed.");
 static_assert(get_max_power_of_2(6) == 2, "Test failed.");
 // Number of VKQ rows calculated in parallel:
 // frag_m times the smaller of nwarps and the largest power of 2 that divides D/frag_m.
 constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) {
    return frag_m*(nwarps > get_max_power_of_2(D/frag_m) ? get_max_power_of_2(D/frag_m) : nwarps);
 }
 static_assert(get_VKQ_stride(128, 1, 32) ==  32, "Test failed.");
 static_assert(get_VKQ_stride(128, 2, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed.");
 static_assert(get_VKQ_stride( 64, 1, 32) ==  32, "Test failed.");
 static_assert(get_VKQ_stride( 64, 2, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride( 64, 4, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 // Host side launcher for flash_attn_ext_f16 with head size D, cols_per_block Q columns per block
 // and KQ accumulator type KQ_acc_t. The number of parallel blocks (4, 2 or 1) is chosen by comparing
 // the number of blocks needed without parallelization against the number of streaming multiprocessors
 // of the current device.
 template <int D, int cols_per_block, typename KQ_acc_t>
 void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps     = 4;
    constexpr int frag_m     = (cols_per_block == 8 && D % 32 == 0) ? 32 : 16;
    constexpr int VKQ_stride = get_VKQ_stride(D, nwarps, frag_m);
    const ggml_tensor * Q = dst->src[0];
    // Number of blocks that are needed with parallel_blocks == 1:
    const int n_blocks_single = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
    const int n_sm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
    if (4*n_blocks_single < 2*n_sm) {
        constexpr int parallel_blocks = 4;
        fattn_kernel_t kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, VKQ_stride, parallel_blocks, KQ_acc_t>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, nwarps, cols_per_block, true, true);
    } else if (2*n_blocks_single < 2*n_sm) {
        constexpr int parallel_blocks = 2;
        fattn_kernel_t kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, VKQ_stride, parallel_blocks, KQ_acc_t>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, nwarps, cols_per_block, true, true);
    } else {
        constexpr int parallel_blocks = 1;
        fattn_kernel_t kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, VKQ_stride, parallel_blocks, KQ_acc_t>;
        launch_fattn<D, parallel_blocks>(ctx, dst, kernel, nwarps, cols_per_block, true, true);
    }
 }
 // Expands to an explicit instantiation of ggml_cuda_flash_attn_ext_wmma_f16_case for one combination of
 // head size D, Q columns per block and KQ accumulator type. Prefixed with `extern` (as below) it only
 // declares the instantiation; the definitions are presumably provided in separate translation units.
 // The last line of the macro must not end with a backslash, otherwise the line that follows the macro
 // becomes part of its replacement list.
 #define DECL_FATTN_WMMA_F16_CASE(D, cols_per_block, KQ_acc_t)                         \
    template void ggml_cuda_flash_attn_ext_wmma_f16_case                              \
    <D, cols_per_block, KQ_acc_t>(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 // float KQ accumulator, 16 columns per block:
 extern DECL_FATTN_WMMA_F16_CASE( 64, 16, float);
 extern DECL_FATTN_WMMA_F16_CASE( 80, 16, float);
 extern DECL_FATTN_WMMA_F16_CASE( 96, 16, float);
 extern DECL_FATTN_WMMA_F16_CASE(112, 16, float);
 extern DECL_FATTN_WMMA_F16_CASE(128, 16, float);
 extern DECL_FATTN_WMMA_F16_CASE(256, 16, float);
 // float KQ accumulator, 32 columns per block:
 extern DECL_FATTN_WMMA_F16_CASE( 64, 32, float);
 extern DECL_FATTN_WMMA_F16_CASE( 80, 32, float);
 extern DECL_FATTN_WMMA_F16_CASE( 96, 32, float);
 extern DECL_FATTN_WMMA_F16_CASE(112, 32, float);
 extern DECL_FATTN_WMMA_F16_CASE(128, 32, float);
 // NOTE(review): the disabled entry below repeats (256, 16, float) from the 16 column group,
 // presumably (256, 32, float) was meant - confirm before enabling it.
 // extern DECL_FATTN_WMMA_F16_CASE(256, 16, float);
 // half KQ accumulator, 8 columns per block:
 extern DECL_FATTN_WMMA_F16_CASE( 64,  8, half);
 extern DECL_FATTN_WMMA_F16_CASE( 96,  8, half);
 extern DECL_FATTN_WMMA_F16_CASE(128,  8, half);
 extern DECL_FATTN_WMMA_F16_CASE(256,  8, half);
 // half KQ accumulator, 16 columns per block:
 extern DECL_FATTN_WMMA_F16_CASE( 64, 16, half);
 extern DECL_FATTN_WMMA_F16_CASE( 80, 16, half);
 extern DECL_FATTN_WMMA_F16_CASE( 96, 16, half);
 extern DECL_FATTN_WMMA_F16_CASE(112, 16, half);
 extern DECL_FATTN_WMMA_F16_CASE(128, 16, half);
 extern DECL_FATTN_WMMA_F16_CASE(256, 16, half);
 // half KQ accumulator, 32 columns per block:
 extern DECL_FATTN_WMMA_F16_CASE( 64, 32, half);
 extern DECL_FATTN_WMMA_F16_CASE( 80, 32, half);
 extern DECL_FATTN_WMMA_F16_CASE( 96, 32, half);
 extern DECL_FATTN_WMMA_F16_CASE(112, 32, half);
 extern DECL_FATTN_WMMA_F16_CASE(128, 32, half);
 // NOTE(review): duplicates the (256, 16, half) declaration of the 16 column group - confirm whether
 // this is intentional or whether the entry should be removed.
 extern DECL_FATTN_WMMA_F16_CASE(256, 16, half);
--- a/ggml-cuda/fattn.cu
+++ b/ggml-cuda/fattn.cu
@ -4,454 +4,295 @@
 #include "fattn-tile-f32.cuh"
 #include "fattn-vec-f16.cuh"
 #include "fattn-vec-f32.cuh"
 #include "fattn-wmma-f16.cuh"
 #include "fattn.cuh"
 #include <cstdint>
-#if FP16_MMA_AVAILABLE
+static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-#include <mma.h>
+    const ggml_tensor * KQV = dst;
 #endif
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
 #if FP16_MMA_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
    const int ip  =        blockIdx.x % parallel_blocks;  // Index in group of blocks running for the same column in parallel.
    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
    constexpr int frag_m = ncols == 8 ? 32 : 16;
    constexpr int frag_n = ncols == 8 ?  8 : 16;
    static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::row_major> frag_a_K;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_a_V;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t>                      frag_c_KQ;
    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half>                          frag_c_VKQ;
    constexpr int KQ_stride_tc  = nwarps*frag_m; // Number of KQ rows calculated in parallel.
    constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
    static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps.");
    // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts:
    constexpr int D_padded = D + 8;
    constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float * Q_f   = (const float *) (Q + nb02* blockIdx.y              + nb01*ic0);
    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.y / gqa_ratio));
    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);
    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);
    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);
    frag_b Q_b[D/16][ncols/frag_n];
    // A single buffer for temporarily holding tiles of KQ and VKQ parts:
    constexpr int mem_KQ = ncols*kqs_padded*kqar;
    constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded;
    __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts];
    float * KQ_f = (float *) KQ;
    half2 * KQ2 = (half2 *) KQ;
    float    KQ_rowsum_f[ncols/nwarps] = {0.0f};
    float       KQ_max_f[ncols/nwarps];
    float KQ_max_scale_f[ncols/nwarps] = {0.0f};
 #pragma unroll
    for (int j = 0; j < ncols/nwarps; ++j) {
        KQ_max_f[j] = -FLT_MAX/2.0f;
    }
    half2    KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}};
    half2       KQ_max_h2[ncols/nwarps];
    half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}};
 #pragma unroll
    for (int j = 0; j < ncols/nwarps; ++j) {
        KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF);
    }
    __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
    half2 * VKQ2 = (half2 *) VKQ;
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;
 #pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D/2 && i >= D/2) {
                break;
            }
            VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f);
        }
    }
    // Convert Q to half and apply scale, temporarily store in KQ:
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;
 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D && i >= D) {
                break;
            }
            KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
        }
    }
    __syncthreads();
    // Load Q into tensor core fragments/registers since it will be used frequently:
 #pragma unroll
    for (int i0 = 0; i0 < D; i0 += 16) {
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
            nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
        }
    }
    __syncthreads();
    // Iterate over ne11 == previous tokens:
    for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
        // Calculate tile of KQ:
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
            frag_c_KQ KQ_c[ncols/frag_n];
 #pragma unroll
            for (int j = 0; j < ncols/frag_n; ++j) {
                nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
            }
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
                frag_a_K K_a;
                nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
 #pragma unroll
                for (int j = 0; j < ncols/frag_n; ++j) {
                    nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
                }
            }
 #pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
                nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major);
            }
        }
        __syncthreads();
        // Calculate softmax for each KQ column using the current max. value.
        // The divisor is stored in KQ_rowsum and will be applied at the end.
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            if (std::is_same<KQ_acc_t, float>::value) {
                float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
                }
                float KQ_max_new = KQ_max_f[j0/nwarps];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                    KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
                }
                KQ_max_new = warp_reduce_max(KQ_max_new);
                const float diff = KQ_max_f[j0/nwarps] - KQ_max_new;
                KQ_max_scale_f[j0/nwarps] = expf(diff);
                if (diff <= SOFTMAX_FTZ_THRESHOLD) {
                    KQ_max_scale_f[j0/nwarps] = 0.0f;
                }
                KQ_max_f[j0/nwarps] = KQ_max_new;
                float KQ_rowsum_add = 0.0f;
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps];
                    KQ_f_tmp[k0/WARP_SIZE] = expf(diff);
                    if (diff <= SOFTMAX_FTZ_THRESHOLD) {
                        KQ_f_tmp[k0/WARP_SIZE] = 0.0f;
                    }
                    KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE];
                    KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE];
                }
                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
                KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add;
            } else {
                half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
                }
                half2 KQ_max_new = KQ_max_h2[j0/nwarps];
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                    KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
                }
                KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
                const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
                KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
                const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
                *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask;
                KQ_max_h2[j0/nwarps] = KQ_max_new;
                half2 KQ_rowsum_add = make_half2(0.0f, 0.0f);
 #pragma unroll
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                    const int k = k0 + threadIdx.x;
                    const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps];
                    KQ2_tmp[k0/WARP_SIZE] = h2exp(diff);
                    const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
                    *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask;
                    KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE];
                    KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE];
                }
                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
                KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add;
            }
        }
        __syncthreads();
        frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
 #pragma unroll
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
                nvcuda::wmma::load_matrix_sync(
                    KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
                    KQ + j0*(kqar*kqs_padded) + k,
                    kqar*kqs_padded);
            }
        }
        frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n];
 #pragma unroll
        for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) {
 #pragma unroll
            for (int j = 0; j < ncols/frag_n; ++j) {
                nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f);
            }
 #pragma unroll
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
                frag_a_V v_a;
                nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
 #pragma unroll
                for (int j = 0; j < ncols/frag_n; ++j) {
                    nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
                }
            }
        }
        __syncthreads();
        const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded);
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) {
 #pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
                nvcuda::wmma::store_matrix_sync(
                    KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
                    VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
                    D_padded, nvcuda::wmma::mem_col_major);
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;
            half2 VKQ_scale;
            if (std::is_same<KQ_acc_t, float>::value) {
                VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]);
            } else {
                VKQ_scale = KQ_max_scale_h2[j0/nwarps];
            }
 #pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;
                if (i0 + WARP_SIZE > D/2 && i >= D/2) {
                    break;
                }
                half2 VKQ_add = make_half2(0.0f, 0.0f);
 #pragma unroll
                for (int l = 0; l < VKQ_ratio; ++l) {
                    VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i];
                }
                VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add;
            }
        }
        __syncthreads();
    }
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j_VKQ = j0 + threadIdx.y;
        if (ic0 + j_VKQ >= ne01) {
            return;
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        float KQ_rowsum_j;
        if (std::is_same<KQ_acc_t, float>::value) {
            KQ_rowsum_j = KQ_rowsum_f[j0/nwarps];
        } else {
            KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
        }
 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
            if (i0 + WARP_SIZE > D && i >= D) {
                break;
            }
            float dst_val = VKQ[j_VKQ*D_padded + i];
            if (parallel_blocks == 1) {
                dst_val /= KQ_rowsum_j;
            }
            dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
        }
        if (parallel_blocks == 1 || threadIdx.x != 0) {
            continue;
        }
        float2 dst_meta_val;
        if (std::is_same<KQ_acc_t, float>::value) {
            dst_meta_val.x = KQ_max_f[j0/nwarps];
        } else {
            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
        }
        dst_meta_val.y = KQ_rowsum_j;
        dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
    }
 #else
   NO_DEVICE_CODE;
 #endif // FP16_MMA_AVAILABLE
 }
 // Largest power of 2 that evenly divides x; any odd x yields 1.
 // Usable in constant expressions (see the static_asserts right below).
 // NOTE(review): x == 0 is even at every step, so the recursion never reaches the odd base case.
 constexpr int get_max_power_of_2(int x) {
    return x % 2 != 0 ? 1 : 2*get_max_power_of_2(x/2);
 }
 static_assert(get_max_power_of_2(1) == 1, "Test failed.");
 static_assert(get_max_power_of_2(2) == 2, "Test failed.");
 static_assert(get_max_power_of_2(4) == 4, "Test failed.");
 static_assert(get_max_power_of_2(6) == 2, "Test failed.");
 // Number of VKQ rows calculated in parallel.
 // Evaluates to frag_m * min(nwarps, p), where p = get_max_power_of_2(D/frag_m).
 constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) {
    return frag_m*(nwarps <= get_max_power_of_2(D/frag_m) ? nwarps : get_max_power_of_2(D/frag_m));
 }
 static_assert(get_VKQ_stride(128, 1, 32) ==  32, "Test failed.");
 static_assert(get_VKQ_stride(128, 2, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed.");
 static_assert(get_VKQ_stride( 64, 1, 32) ==  32, "Test failed.");
 static_assert(get_VKQ_stride( 64, 2, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride( 64, 4, 32) ==  64, "Test failed.");
 static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 // NOTE(review): this span is diff residue, not a single compilable function.
 // Lines starting with '-' are the removed launch_fattn_f16 body: it chose parallel_blocks = 4, 2
 // or 1 by comparing blocks_num_pb1 against nsm and launched flash_attn_ext_f16 via launch_fattn.
 // Lines starting with '+' together with the unmarked switch statements form a dispatcher that
 // picks cols_per_block and the KQ accumulator type from `precision`, Q->ne[1] and Q->ne[0] and
 // forwards to ggml_cuda_flash_attn_ext_wmma_f16_case.
 // Presumably the '+' lines belong to a different function whose header is not shown here
 // (KQV is not declared anywhere in this span) -- TODO confirm against the real file.
 template <int D, int cols_per_block, int nwarps, typename KQ_acc_t>
 void launch_fattn_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q   = dst->src[0];
-    constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
+    const int32_t precision = KQV->op_params[2];
    const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
-    if (4*blocks_num_pb1 < 2*nsm) {
    // Non-default precision: every case below instantiates with a float KQ accumulator.
+    if (precision != GGML_PREC_DEFAULT) {
-        constexpr int parallel_blocks = 4;
        // 16 columns per block when Q->ne[1] <= 32 or Q->ne[0] > 128, otherwise 32.
+        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+            constexpr int cols_per_block = 16;
-        launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            switch (Q->ne[0]) {
                case 64:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
                    break;
                case 80:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
                    break;
                case 96:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
                    break;
                case 112:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
                    break;
                case 128:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                    break;
                case 256:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
        } else {
            constexpr int cols_per_block = 32;
            switch (Q->ne[0]) {
                case 64:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
                    break;
                case 80:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
                    break;
                case 96:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
                    break;
                case 112:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
                    break;
                case 128:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                    break;
                // case 256:
                //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                //     break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
        }
        return;
    }
-    if (2*blocks_num_pb1 < 2*nsm) {
+
-        constexpr int parallel_blocks = 2;
    // Default precision from here on: every remaining case instantiates with a half KQ accumulator.
    // Up to 8 columns with Q->ne[0] a multiple of WARP_SIZE: 8 columns per block.
+    if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+        constexpr int cols_per_block = 8;
-        launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+        switch (Q->ne[0]) {
            case 64:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
                break;
            case 96:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
                break;
            case 128:
                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
                break;
            case 256:
                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }
-    constexpr int parallel_blocks = 1;
+
-    fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
    // Up to 32 columns: 16 columns per block.
+    if (Q->ne[1] <= 32) {
-    launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+        constexpr int cols_per_block = 16;
        switch (Q->ne[0]) {
            case 64:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
                break;
            case 80:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
                break;
            case 96:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
                break;
            case 112:
                ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
                break;
            case 128:
                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
                break;
            case 256:
                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }
    // Everything else: 32 columns per block.
    constexpr int cols_per_block = 32;
    switch (Q->ne[0]) {
        case 64:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
            break;
        case 80:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
            break;
        case 96:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
            break;
        case 112:
            ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
            break;
        case 128:
            ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
            break;
        case 256:
            ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
 }
 // Dispatch helper for the f16 vector flash-attention path.
 // If Q->ne[0] == D and the K/V tensor types equal (type_K, type_V), calls the matching
 // ggml_cuda_flash_attn_ext_vec_f16_case instance and returns from the enclosing function.
 // Expects Q, K, V, ctx and dst to be in scope at the expansion site.
 // The final line carries no trailing backslash so the macro cannot absorb the line that follows it.
 #define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }
 // Runs the f16 vector flash-attention instance that matches Q->ne[0] and the K/V tensor types.
 // Each FATTN_VEC_F16_CASE line returns from this function on a match; if none matches,
 // on_no_fattn_vec_case(Q->ne[0]) is called. Defining GGML_CUDA_FA_ALL_QUANTS enables the
 // larger list of K/V type combinations.
 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // Q is dst->src[0]; it was previously read from src[1], which made it an alias of K.
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];
 #ifdef GGML_CUDA_FA_ALL_QUANTS
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 )
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #else
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #endif // GGML_CUDA_FA_ALL_QUANTS
    // No case matched: hand Q->ne[0] to the fallback handler.
    on_no_fattn_vec_case(Q->ne[0]);
 }
 // Dispatch helper for the f32 vector flash-attention path.
 // If Q->ne[0] == D and the K/V tensor types equal (type_K, type_V), calls the matching
 // ggml_cuda_flash_attn_ext_vec_f32_case instance and returns from the enclosing function.
 // Expects Q, K, V, ctx and dst to be in scope at the expansion site.
 // The final line carries no trailing backslash so the macro cannot absorb the line that follows it.
 #define FATTN_VEC_F32_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }
 // Runs the f32 vector flash-attention instance that matches Q->ne[0] and the K/V tensor types.
 // Each FATTN_VEC_F32_CASE line returns from this function on a match; if none matches,
 // on_no_fattn_vec_case(Q->ne[0]) is called. Defining GGML_CUDA_FA_ALL_QUANTS enables the
 // larger list of K/V type combinations.
 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // Q is dst->src[0]; it was previously read from src[1], which made it an alias of K.
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];
 #ifdef GGML_CUDA_FA_ALL_QUANTS
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #else
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #endif // GGML_CUDA_FA_ALL_QUANTS
    // No case matched: hand Q->ne[0] to the fallback handler.
    on_no_fattn_vec_case(Q->ne[0]);
 }
 void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -464,8 +305,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    // On AMD the tile kernels perform poorly, use the vec kernel instead:
    if (cc >= CC_OFFSET_AMD) {
-        if (precision == GGML_PREC_DEFAULT) {
+        if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
-            ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        } else {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
        }
@ -483,156 +324,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    if (!fp16_mma_available(cc)) {
        if (Q->ne[1] <= 8) {
-            ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        } else {
            ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
        }
        return;
    }
-    if (precision != GGML_PREC_DEFAULT) {
+    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
-        if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
+        if (precision == GGML_PREC_DEFAULT) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
            return;
        } else if(Q->ne[0] <= 128) {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
            return;
        }
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
            constexpr int nwarps         =  4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 80:
                    launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 96:
                    launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 112:
                    launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 128:
                    launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 256:
                    launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
        } else {
            constexpr int cols_per_block = 32;
            constexpr int nwarps         =  4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 80:
                    launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 96:
                    launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 112:
                    launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                case 128:
                    launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
                    break;
                // case 256:
                //     launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
                //     break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
        }
        return;
    }
-    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
+    ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
        ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        return;
    }
    if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
        constexpr int cols_per_block = 8;
        constexpr int nwarps         = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 96:
                launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 128:
                launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 256:
                launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }
    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block = 16;
        constexpr int nwarps         =  4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 80:
                launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 96:
                launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 112:
                launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 128:
                launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
                break;
            case 256:
                launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }
    constexpr int cols_per_block = 32;
    constexpr int nwarps         =  4;
    switch (Q->ne[0]) {
        case 64:
            launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
            break;
        case 80:
            launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
            break;
        case 96:
            launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
            break;
        case 112:
            launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
            break;
        case 128:
            launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
            break;
        case 256:
            launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
    return;
 }
--- a/ggml-cuda/mmq.cu
+++ b/ggml-cuda/mmq.cu
@ -386,7 +386,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
    }
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+    return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
@ -547,7 +547,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }
--- a/ggml-cuda/norm.cu
+++ b/ggml-cuda/norm.cu
@ -170,6 +170,8 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -188,6 +190,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -202,6 +206,8 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
--- a/ggml-cuda/rope.cu
+++ b/ggml-cuda/rope.cu
@ -61,7 +61,7 @@ static __global__ void rope(
 template<typename T, bool has_pos, bool has_freq_facs>
 static __global__ void rope_neox(
    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors
 ) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@ -85,15 +85,13 @@ static __global__ void rope_neox(
    const int i  = row*ncols + ib*n_dims + ic/2;
    const int i2 = row/p_delta_rows;
    float cur_rot = inv_ndims * ic - ib;
    const int p = has_pos ? pos[i2] : 0;
    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
+    const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;
    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];
@ -174,30 +172,29 @@ static void rope_neox_cuda(
    const dim3 block_nums(nrows, num_blocks_x, 1);
    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const float inv_ndims = -1.0f / n_dims;
    if (pos == nullptr) {
        if (freq_factors == nullptr) {
            rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
                );
        } else {
            rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
                );
        }
    } else {
        if (freq_factors == nullptr) {
            rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
                );
        } else {
            rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
                );
        }
    }
@ -254,6 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
--- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@ -0,0 +1,5 @@
 // This file has been autogenerated by generate-variants.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"
 DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
--- a/Show More
+++ b/Show More