Merge branch 'ggerganov:master' into ag_cuda_graphs

Commit 9c5786161d by agray3, 2024-04-29 14:04:11 +01:00 (committed by GitHub)
75 changed files with 3939 additions and 8688 deletions


@ -32,7 +32,7 @@ on:
- cron: '04 2 * * *' - cron: '04 2 * * *'
concurrency: concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:


@ -32,6 +32,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Dependencies - name: Dependencies
id: depends id: depends
@ -88,6 +90,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Dependencies - name: Dependencies
id: depends id: depends
@ -206,6 +210,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Dependencies - name: Dependencies
id: depends id: depends
@ -238,6 +244,33 @@ jobs:
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip
# ubuntu-latest-cmake-sanitizer: # ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
# #
@ -560,6 +593,63 @@ jobs:
run: | run: |
make swift make swift
windows-msys2:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
include:
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
- { sys: CLANG64, env: clang-x86_64, build: Release }
steps:
- name: Clone
uses: actions/checkout@v4
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}
install: >-
base-devel
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make LLAMA_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
cmake -B build
cmake --build build --config ${{ matrix.build }} -j $(nproc)
- name: Clean after building using CMake
shell: msys2 {0}
run: |
rm -rf build
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-latest runs-on: windows-latest


@ -23,7 +23,7 @@ on:
- cron: '2 4 * * *' - cron: '2 4 * * *'
concurrency: concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
@ -41,23 +41,16 @@ jobs:
sanitizer: "" sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:
image: ubuntu:latest
ports:
- 8888
options: --cpus 4
steps: steps:
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
-          apt-get update
-          apt-get -y install \
+          sudo apt-get update
+          sudo apt-get -y install \
build-essential \ build-essential \
xxd \ xxd \
git \ git \
cmake \ cmake \
python3-pip \
curl \ curl \
wget \ wget \
language-pack-en \ language-pack-en \
@ -70,6 +63,17 @@ jobs:
fetch-depth: 0 fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Verify server deps - name: Verify server deps
id: verify_server_deps id: verify_server_deps
run: | run: |
@ -100,10 +104,6 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
@ -129,6 +129,7 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
fetch-depth: 0 fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: libCURL - name: libCURL
id: get_libcurl id: get_libcurl

.gitignore

@ -34,6 +34,7 @@ lcov-report/
gcovr-report/ gcovr-report/
build* build*
!build.zig
cmake-build-* cmake-build-*
out/ out/
tmp/ tmp/
@ -100,6 +101,9 @@ qnt-*.txt
perf-*.txt perf-*.txt
examples/jeopardy/results.txt examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
poetry.lock poetry.lock
poetry.toml poetry.toml


@ -43,17 +43,7 @@ else()
set(LLAMA_METAL_DEFAULT OFF) set(LLAMA_METAL_DEFAULT OFF)
endif() endif()
-# TODO: fix this for Android CI
-# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
-#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-#else()
-#    set(LLAMA_LLAMAFILE_DEFAULT ON)
-#endif()
-# TODO: temporary disable until MoE is fixed
-# https://github.com/ggerganov/llama.cpp/pull/6716
-set(LLAMA_LLAMAFILE_DEFAULT OFF)
+set(LLAMA_LLAMAFILE_DEFAULT ON)
# general # general
option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(BUILD_SHARED_LIBS "build shared libraries" OFF)


@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS endif # LLAMA_OPENBLAS
# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
LLAMA_NO_LLAMAFILE := 1
ifndef LLAMA_NO_LLAMAFILE ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o OBJS += sgemm.o
@ -699,7 +695,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS) common.o: common/common.cpp $(COMMON_H_DEPS)
@ -772,7 +768,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -800,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
echo "};" && \
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@
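For illustration, a header generated by this rule (for example from examples/server/public/index.html) would look roughly like the sketch below; the byte values and the length are placeholders, not the real file contents:

```cpp
// examples/server/index.html.hpp (generated) -- illustrative sketch only
unsigned char index_html[] = {
    0x3c, 0x21, 0x64, 0x6f, 0x63, 0x74, 0x79, 0x70, 0x65, // "<!doctype", remaining bytes omitted
};
unsigned int index_html_len = 1234; // placeholder length
```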
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)


@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU # Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build mkdir -p build && cd build
-# Option 1: Use FP16 for better performance in long-prompt inference
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
#build all binary #build all binary
cmake --build . --config Release -j -v cmake --build . --config Release -j -v
``` ```
@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL # Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build mkdir -p build && cd build
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
#build all binary #build all binary
cmake --build . --config Release -j -v cmake --build . --config Release -j -v
@ -416,6 +416,10 @@ mkdir -p build
cd build cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 2: Or FP16
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make -j make -j


@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes ### Recent API changes
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@ -92,10 +93,11 @@ Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙 - [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙 - [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
-- [X] Falcon
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@ -118,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma) - [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba) - [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse) - [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
@ -134,6 +137,8 @@ Typically finetunes of the base models below are supported as well.
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
**HTTP server** **HTTP server**
@ -549,7 +554,7 @@ Building the program with BLAS support may lead to some performance improvements
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK). You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page. - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
@ -574,6 +579,12 @@ Building the program with BLAS support may lead to some performance improvements
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages. Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source. Alternatively, they may be built from source.
- <details> - <details>
@ -1110,7 +1121,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
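A minimal sketch of this convention, assuming `ctx` is an initialized `ggml_context *` and `K`, `M`, `N` are illustrative dimensions (ggml's `ne[0]` is the fastest-varying "column" dimension):

```cpp
// A holds M rows of K columns, B holds N rows of K columns: both share ne[0] == K.
struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, M);
struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, N);
// The result has M columns and N rows, i.e. C = B A^T (equivalently C^T = A B^T).
struct ggml_tensor * C = ggml_mul_mat(ctx, A, B); // C->ne = { M, N, 1, 1 }
```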
### Docs ### Docs


@ -140,4 +140,33 @@ pub fn build(b: *std.build.Builder) !void {
if (server.target.isWindows()) { if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32"); server.linkSystemLibrary("ws2_32");
} }
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
for (server_assets) |asset| {
const input_path = b.fmt("examples/server/public/{s}", .{asset});
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
defer b.allocator.free(input);
var buf = std.ArrayList(u8).init(b.allocator);
defer buf.deinit();
for (input) |byte| {
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
}
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
defer b.allocator.free(name);
std.mem.replaceScalar(u8, name, '.', '_');
try std.fs.cwd().writeFile(output_path, b.fmt(
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
.{ name, buf.items, name, input.len },
));
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
}
} }


@ -160,7 +160,9 @@ function gg_run_test_scripts_debug {
set -e set -e
# TODO: too slow, run on dedicated node
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e set +e
} }
@ -184,6 +186,7 @@ function gg_run_test_scripts_release {
set -e set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e set +e
} }


@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
} }
-#if defined(__x86_64__) && defined(__linux__)
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h> #include <pthread.h>
static void cpuid(unsigned leaf, unsigned subleaf, static void cpuid(unsigned leaf, unsigned subleaf,
@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
* Returns number of CPUs on system that are useful for math. * Returns number of CPUs on system that are useful for math.
*/ */
int get_math_cpu_count() { int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__)
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) { if (cpu_count < 1) {
return get_num_physical_cores(); return get_num_physical_cores();
@ -234,15 +234,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result; return result;
} }
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.val_f64 = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.val_bool = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false;
} else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else if (strncmp(sep, "str:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) {
fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false;
}
strncpy(kvo.val_str, sep, 127);
kvo.val_str[127] = '\0';
} else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
return true;
}
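A hedged usage sketch for the helper added above; the override strings follow the `--override-kv KEY=TYPE:VALUE` syntax documented later in this change, and the keys shown here are illustrative:

```cpp
std::vector<llama_model_kv_override> kv_overrides;
// Parses "KEY=TYPE:VALUE" into a llama_model_kv_override entry; returns false on malformed input.
parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", kv_overrides); // LLAMA_KV_OVERRIDE_TYPE_BOOL
parse_kv_override("general.name=str:my-model",               kv_overrides); // LLAMA_KV_OVERRIDE_TYPE_STR
```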
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") { if (arg == "-s" || arg == "--seed") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
return true; return true;
} }
// This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]); params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true; return true;
} }
if (arg == "-t" || arg == "--threads") { if (arg == "-t" || arg == "--threads") {
@ -1087,6 +1135,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_print = std::stoi(argv[i]); params.n_print = std::stoi(argv[i]);
return true; return true;
} }
if (arg == "--check-tensors") {
params.check_tensors = true;
return true;
}
if (arg == "--ppl-output-type") { if (arg == "--ppl-output-type") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -1238,47 +1290,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true; invalid_param = true;
return true; return true;
} }
-        char* sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.kv_overrides.push_back(kvo);
         return true;
     }
 #ifndef LOG_DISABLE_LOGS
@ -1549,9 +1565,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -ptc N, --print-token-count N\n"); printf(" -ptc N, --print-token-count N\n");
printf(" print token count every N tokens (default: %d)\n", params.n_print); printf(" print token count every N tokens (default: %d)\n", params.n_print);
printf(" --check-tensors check model tensor data for invalid values\n");
printf("\n"); printf("\n");
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_print_usage(); log_print_usage();
@ -1772,6 +1789,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.tensor_split = params.tensor_split; mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap; mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock; mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) { if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL; mparams.kv_overrides = NULL;
} else { } else {
@ -2326,12 +2344,12 @@ std::vector<llama_token> llama_tokenize(
return result; return result;
} }
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);


@ -86,8 +86,8 @@ struct gpt_params {
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;
@ -161,6 +161,7 @@ struct gpt_params {
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V std::string cache_type_v = "f16"; // KV cache data type for the V
@ -170,6 +171,8 @@ struct gpt_params {
std::string image = ""; // path to an image file std::string image = ""; // path to an image file
}; };
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@ -237,11 +240,12 @@ std::vector<llama_token> llama_tokenize(
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece` // should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece( std::string llama_token_to_piece(
const struct llama_context * ctx, const struct llama_context * ctx,
-        llama_token   token);
+        llama_token   token,
+        bool          special = true);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space // that takes into account the tokenizer type and decides how to handle the leading space
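A short usage sketch for the updated helper, assuming `ctx` is an initialized `llama_context *`; the token and the rendered text are illustrative:

```cpp
const llama_token tok = llama_token_bos(llama_get_model(ctx));
std::string rendered = llama_token_to_piece(ctx, tok);        // special = true: control tokens rendered, e.g. "<s>"
std::string plain    = llama_token_to_piece(ctx, tok, false); // special = false: control tokens suppressed
```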


@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG() INSTEAD // USE LOG() INSTEAD
// //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_IMPL(str, ...) \ #define LOG_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD // USE LOG_TEE() INSTEAD
// //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_TEE_IMPL(str, ...) \ #define LOG_TEE_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \


@ -1,4 +1,6 @@
#define LLAMA_API_INTERNAL
#include "sampling.h" #include "sampling.h"
#include <random>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context(); struct llama_sampling_context * result = new llama_sampling_context();
@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev); result->prev.resize(params.n_prev);
llama_sampling_set_rng_seed(result, params.seed);
return result; return result;
} }
@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
ctx->cur.clear(); ctx->cur.clear();
} }
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
ctx->rng.seed(seed);
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) { if (dst->grammar) {
llama_grammar_free(dst->grammar); llama_grammar_free(dst->grammar);
@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(
sampler_queue(ctx_main, params, cur_p, min_keep); sampler_queue(ctx_main, params, cur_p, min_keep);
-        id = llama_sample_token(ctx_main, &cur_p);
+        id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
//{ //{
// const int n_top = 10; // const int n_top = 10;


@ -4,9 +4,10 @@
#include "grammar-parser.h" #include "grammar-parser.h"
#include <random>
#include <string> #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
// sampler types // sampler types
enum class llama_sampler_type : char { enum class llama_sampler_type : char {
@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
// sampling parameters // sampling parameters
typedef struct llama_sampling_params { typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
std::vector<llama_sampler_type> samplers_sequence = { std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K, llama_sampler_type::TOP_K,
@ -79,6 +81,8 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> prev; std::vector<llama_token> prev;
std::vector<llama_token_data> cur; std::vector<llama_token_data> cur;
std::mt19937 rng;
}; };
#include "common.h" #include "common.h"
@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
// - reset grammar // - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx); void llama_sampling_reset(llama_sampling_context * ctx);
// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
// Copy the sampler context // Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);


@ -363,6 +363,16 @@ class Model(ABC):
scores.append(-1000.0) scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED) toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
print(
f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
)
for i in range(1, pad_count + 1):
tokens.append(f"[PAD{i}]")
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
assert len(tokens) == vocab_size assert len(tokens) == vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
@ -1301,15 +1311,23 @@ class LlamaModel(Model):
try: try:
self. _set_vocab_sentencepiece() self. _set_vocab_sentencepiece()
except FileNotFoundError: except FileNotFoundError:
-            self._set_vocab_llama_hf()
+            try:
+                self._set_vocab_llama_hf()
+            except (FileNotFoundError, TypeError):
+                # Llama 3
+                self._set_vocab_gpt2()

-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
-        special_vocab._set_special_token("prefix", 32007)
-        special_vocab._set_special_token("suffix", 32008)
-        special_vocab._set_special_token("middle", 32009)
-        special_vocab._set_special_token("eot",    32010)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot",    32010)
+            special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
super().set_gguf_parameters() super().set_gguf_parameters()
@ -1781,6 +1799,12 @@ class QwenModel(Model):
class Qwen2Model(Model): class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2 model_arch = gguf.MODEL_ARCH.QWEN2
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
@Model.register("Qwen2MoeForCausalLM") @Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model): class Qwen2MoeModel(Model):
@ -1971,6 +1995,91 @@ class Phi2Model(Model):
self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_add_bos_token(False)
@Model.register("Phi3ForCausalLM")
class Phi3MiniModel(Model):
model_arch = gguf.MODEL_ARCH.PHI3
def set_vocab(self):
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
if not tokenizer_path.is_file():
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(token_id)
text = piece.encode("utf-8")
score = tokenizer.get_score(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
rot_pct = 1.0
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
rms_eps = self.find_hparam(["rms_norm_eps"])
self.gguf_writer.add_name("Phi3")
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(8192)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head)
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
self.gguf_writer.add_file_type(self.ftype)
@Model.register("PlamoForCausalLM") @Model.register("PlamoForCausalLM")
class PlamoModel(Model): class PlamoModel(Model):
model_arch = gguf.MODEL_ARCH.PLAMO model_arch = gguf.MODEL_ARCH.PLAMO
@ -2194,6 +2303,8 @@ class InternLM2Model(Model):
old_eos = special_vocab.special_token_ids["eos"] old_eos = special_vocab.special_token_ids["eos"]
if "chat" in os.path.basename(self.dir_model.absolute()): if "chat" in os.path.basename(self.dir_model.absolute()):
# For the chat model, we replace the eos with '<|im_end|>'. # For the chat model, we replace the eos with '<|im_end|>'.
# TODO: this is a hack, should be fixed
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.") in chat mode so that the conversation can end normally.")
@ -2429,12 +2540,15 @@ class GemmaModel(Model):
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
# TODO: these special tokens should be exported only for the CodeGemma family
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
special_vocab._set_special_token("prefix", 67) special_vocab._set_special_token("prefix", 67)
special_vocab._set_special_token("suffix", 69) special_vocab._set_special_token("suffix", 69)
special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("eot", 70) special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107)
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -2523,28 +2637,34 @@ class MambaModel(Model):
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
def set_gguf_parameters(self): def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model"]) d_model = self.find_hparam(["hidden_size", "d_model"])
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
# ceiling division # ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://stackoverflow.com/a/17511341/22827863
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2 # Fail early for models which don't have a block expansion factor of 2


@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers # pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model'] tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence' or tokenizer_json['decoder']['type'] != 'Sequence'
): ):


@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1 i_batch[i] = -1
// print("") // print("")
if n_parallel > 1 { if n_parallel > 1 {
@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8) var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
if nTokens < 0 { if nTokens < 0 {
let actualTokensCount = -Int(nTokens) let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount) result = .init(repeating: 0, count: actualTokensCount)
@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
model, model,
token, token,
&result, &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
) )
assert(check == actualTokensCount) assert(check == actualTokensCount)
} else { } else {


@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation? -> mark the stream as finished
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
i_batch[i] = -1; i_batch[i] = -1;
LOG_TEE("\n"); LOG_TEE("\n");
if (n_parallel > 1) { if (n_parallel > 1) {


@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc. // For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
} }
// Function matching type llama_beam_search_callback_fn_t. // Function matching type llama_beam_search_callback_fn_t.

examples/gguf-split/tests.sh (changed from normal file to executable file)

@ -4,16 +4,16 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]" echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp" echo "example: $0 ../../build/bin ../../tmp"
exit 1 exit 1
fi fi
if [ $# -gt 1 ] if [ $# -gt 1 ]
then then
TMP_DIR=$2 TMP_DIR=$2
else else
TMP_DIR=/tmp TMP_DIR=/tmp
fi fi
set -x set -x
@ -21,7 +21,7 @@ set -x
SPLIT=$1/gguf-split SPLIT=$1/gguf-split
MAIN=$1/main MAIN=$1/main
WORK_PATH=$TMP_DIR/gguf-split WORK_PATH=$TMP_DIR/gguf-split
-CUR_DIR=$(pwd)
+ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH" mkdir -p "$WORK_PATH"
@ -30,8 +30,8 @@ rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model # 1. Get a model
( (
cd $WORK_PATH cd $WORK_PATH
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf "$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
) )
echo PASS echo PASS

View File

@ -23,6 +23,7 @@ struct Stats {
}; };
struct StatParams { struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat"; std::string ofile = "imatrix.dat";
int n_output_frequency = 10; int n_output_frequency = 10;
int verbosity = 1; int verbosity = 1;
@ -46,7 +47,7 @@ private:
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
// //
void save_imatrix(const char * file_name) const; void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const; void keep_imatrix(int ncall) const;
}; };
@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
} }
void IMatrixCollector::keep_imatrix(int ncall) const { void IMatrixCollector::keep_imatrix(int ncall) const {
@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
if (file_name.empty()) file_name = "imatrix.dat"; if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_"; file_name += ".at_";
file_name += std::to_string(ncall); file_name += std::to_string(ncall);
save_imatrix(file_name.c_str()); save_imatrix(file_name.c_str(), m_params.dataset.c_str());
} }
void IMatrixCollector::save_imatrix(const char * fname) const { void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
for (auto& p : m_stats) { for (const auto & p : m_stats) {
int len = p.first.size(); int len = p.first.size();
out.write((const char*)&len, sizeof(len)); out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len); out.write(p.first.c_str(), len);
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size(); int nval = p.second.values.size();
out.write((const char*)&nval, sizeof(nval)); out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
} }
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);
if (m_params.verbosity > 0) { if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
} }
} }
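
With this change the imatrix file gains two trailing fields after the per-tensor records: the number of chunks the matrix was computed from and the name of the dataset, so quantize can report them later. A hedged sketch of how a consumer might read that trailer back (assuming the per-tensor records were already consumed and that int is 32-bit, as the writer above implies; the function name is illustrative):

#include <cstdint>
#include <fstream>
#include <string>

// Reads the two fields appended by save_imatrix(): the chunk count, then the
// length-prefixed dataset name. Returns false if the stream runs short.
static bool read_imatrix_trailer(std::ifstream & in, int32_t & last_call, std::string & dataset) {
    in.read((char *) &last_call, sizeof(last_call));
    int32_t n_dataset = 0;
    in.read((char *) &n_dataset, sizeof(n_dataset));
    dataset.resize((size_t) n_dataset);
    in.read(&dataset[0], n_dataset);
    return bool(in);
}
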
@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
} }
} }
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams)); g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) { if (!combine_files.empty()) {
@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
} }
} }
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -586,7 +586,7 @@ int main(int argc, char ** argv) {
// deal with eot token in infill mode // deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if(is_interacting && !params.interactive_first) { if (is_interacting && !params.interactive_first) {
// print an eot token // print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
} }
@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
// LOG_TEE("took new input\n"); // LOG_TEE("took new input\n");
is_interacting = false; is_interacting = false;
} }
// deal with end of text token in interactive mode // deal with end of generation tokens in interactive mode
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n"); LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
} }
} }
// end of text token // end of generation
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
break; break;
} }

View File

@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (new_token_id == llama_token_eos(model) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF(""); return env->NewStringUTF("");
} }

View File

@ -158,7 +158,7 @@ actor LlamaContext {
new_token_id = llama_sample_token_greedy(context, &candidates_p) new_token_id = llama_sample_token_greedy(context, &candidates_p)
} }
if new_token_id == llama_token_eos(model) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n") print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0]) let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll() temporary_invalid_cchars.removeAll()
@ -322,7 +322,7 @@ actor LlamaContext {
defer { defer {
result.deallocate() result.deallocate()
} }
let nTokens = llama_token_to_piece(model, token, result, 8) let nTokens = llama_token_to_piece(model, token, result, 8, false)
if nTokens < 0 { if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens)) let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -330,7 +330,7 @@ actor LlamaContext {
defer { defer {
newResult.deallocate() newResult.deallocate()
} }
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens) let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer) return Array(bufferPointer)
} else { } else {

View File

@ -3,6 +3,7 @@
// I'll gradually clean and extend it // I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h" #include "clip.h"
#include "log.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
@ -23,7 +24,6 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <fstream> #include <fstream>
#include <iostream>
#include <map> #include <map>
#include <regex> #include <regex>
#include <stdexcept> #include <stdexcept>
@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@ -145,7 +146,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
static int get_key_idx(const gguf_context * ctx, const char * key) { static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key); int i = gguf_find_key(ctx, key);
if (i == -1) { if (i == -1) {
fprintf(stderr, "key %s not found in file\n", key); LOG_TEE("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key)); throw std::runtime_error(format("Missing required key: %s", key));
} }
@ -247,7 +248,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor); size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size, prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
} }
@ -265,7 +266,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
std::cerr << "Failed to open file for writing: " << filename << std::endl; LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -284,7 +285,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
std::cerr << "Failed to open file for writing: " << filename << std::endl; LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -425,6 +426,7 @@ struct clip_vision_model {
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w; struct ggml_tensor * pre_ln_w;
@ -501,6 +503,11 @@ struct clip_ctx {
bool use_gelu = false; bool use_gelu = false;
int32_t ftype = 1; int32_t ftype = 1;
bool has_class_embedding = true;
bool has_pre_norm = true;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf; struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data; struct ggml_context * ctx_data;
@ -515,7 +522,7 @@ struct clip_ctx {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
} }
@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + 1; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
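
Since vision towers without a class token are now supported, num_positions only adds one position when a class embedding is actually present. A hedged worked example using the LLaVA-1.5 CLIP shape quoted elsewhere in this diff (336 px input, 14 px patches; the identifiers are illustrative):

constexpr int image_size = 336;
constexpr int patch_size = 14;
constexpr int num_patches = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
constexpr int num_positions_with_cls    = num_patches + 1; // 577 when has_class_embedding
constexpr int num_positions_without_cls = num_patches;     // 576 otherwise
static_assert(num_patches == 576, "matches the 576-patch figure quoted in llava.cpp");
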
@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
// concat class_embeddings and patch_embeddings // concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
ggml_set_name(embeddings, "embeddings"); ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings); ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm // pre-layernorm
{ if (ctx->has_pre_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln"); ggml_set_name(embeddings, "pre_ln");
@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur; embeddings = cur;
} }
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// llava projector // llava projector
{ {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@ -879,21 +901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const int idx_name = gguf_find_key(ctx, KEY_NAME); const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name); const std::string name = gguf_get_val_str(ctx, idx_name);
printf("%s: model name: %s\n", __func__, name.c_str()); LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
} }
printf("%s: description: %s\n", __func__, description.c_str()); LOG_TEE("%s: description: %s\n", __func__, description.c_str());
printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: n_tensors: %d\n", __func__, n_tensors); LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
printf("%s: n_kv: %d\n", __func__, n_kv); LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
printf("\n"); LOG_TEE("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname); __func__, n_kv, n_tensors, fname);
{ {
std::map<enum ggml_type, uint32_t> n_type; std::map<enum ggml_type, uint32_t> n_type;
@ -904,7 +926,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
n_type[type]++; n_type[type]++;
} }
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) { for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i); const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i);
@ -920,7 +942,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
replace_all(value, "\n", "\\n"); replace_all(value, "\n", "\\n");
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
} }
// print type counts // print type counts
@ -929,7 +951,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
continue; continue;
} }
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
} }
@ -944,7 +966,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size; model_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
@ -971,18 +993,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__); LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init(); new_clip->backend = ggml_backend_metal_init();
printf("%s: CLIP using Metal backend\n", __func__); LOG_TEE("%s: CLIP using Metal backend\n", __func__);
#endif #endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();
printf("%s: CLIP using CPU backend\n", __func__); LOG_TEE("%s: CLIP using CPU backend\n", __func__);
} }
// model size and capabilities // model size and capabilities
@ -1006,15 +1028,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) { if (verbosity >= 1) {
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
} }
} }
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors // load tensors
{ {
@ -1027,7 +1049,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_data = ggml_init(params); new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) { if (!new_clip->ctx_data) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__); LOG_TEE("%s: ggml_init() failed\n", __func__);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1035,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
auto fin = std::ifstream(fname, std::ios::binary); auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) { if (!fin) {
printf("cannot open model file for loading tensors\n"); LOG_TEE("cannot open model file for loading tensors\n");
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1057,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg); fin.seekg(offset, std::ios::beg);
if (!fin) { if (!fin) {
printf("%s: failed to seek for tensor %s\n", __func__, name); LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1128,34 +1150,61 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
if (verbosity >= 2) { if (verbosity >= 2) {
printf("\n%s: vision model hparams\n", __func__); LOG_TEE("\n%s: vision model hparams\n", __func__);
printf("image_size %d\n", hparams.image_size); LOG_TEE("image_size %d\n", hparams.image_size);
printf("patch_size %d\n", hparams.patch_size); LOG_TEE("patch_size %d\n", hparams.patch_size);
printf("v_hidden_size %d\n", hparams.hidden_size); LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
printf("v_n_intermediate %d\n", hparams.n_intermediate); LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
printf("v_projection_dim %d\n", hparams.projection_dim); LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
printf("v_n_head %d\n", hparams.n_head); LOG_TEE("v_n_head %d\n", hparams.n_head);
printf("v_n_layer %d\n", hparams.n_layer); LOG_TEE("v_n_layer %d\n", hparams.n_layer);
printf("v_eps %f\n", hparams.eps); LOG_TEE("v_eps %f\n", hparams.eps);
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
printf("v_image_grid_pinpoints: "); LOG_TEE("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
printf("%d ", hparams.image_grid_pinpoints[i]); LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
} }
printf("\n"); LOG_TEE("\n");
printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
} }
try {
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
new_clip->has_class_embedding = true;
} catch (const std::exception& e) {
new_clip->has_class_embedding = false;
}
try {
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & e) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & e) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & e) {
new_clip->has_patch_bias = false;
}
try { try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
} catch(const std::exception& e) { } catch(const std::exception& e) {
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); LOG_TEE("%s: failed to load vision model tensors\n", __func__);
} }
// LLaVA projection // LLaVA projection
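
The loader now probes several tensors individually and records which ones exist, instead of failing the whole model when any of them is missing. A hedged sketch of that probing pattern as a small helper (get_tensor and the TN_* names come from clip.cpp; the wrapper itself is illustrative):

#include <exception>
#include <string>

// Returns true and fills `out` when the named tensor exists in ctx_data;
// otherwise leaves `out` null so the corresponding has_* flag can be cleared.
static bool try_get_tensor(struct ggml_context * ctx_data, const std::string & name, struct ggml_tensor *& out) {
    try {
        out = get_tensor(ctx_data, name);
        return true;
    } catch (const std::exception &) {
        out = nullptr;
        return false;
    }
}

// e.g. new_clip->has_patch_bias = try_get_tensor(new_clip->ctx_data, TN_PATCH_BIAS, vision_model.patch_bias);
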
@ -1184,7 +1233,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} catch (std::runtime_error & e) { } } catch (std::runtime_error & e) { }
try { try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & e) { } } catch (std::runtime_error & e) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection // MobileVLM projection
@ -1264,7 +1313,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
} }
return new_clip; return new_clip;
@ -1304,7 +1353,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3); auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname); LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
return false; return false;
} }
build_clip_img_from_data(data, nx, ny, img); build_clip_img_from_data(data, nx, ny, img);
@ -1316,7 +1365,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__); LOG_TEE("%s: failed to decode image bytes\n", __func__);
return false; return false;
} }
build_clip_img_from_data(data, nx, ny, img); build_clip_img_from_data(data, nx, ny, img);
@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
} }
// Linear interpolation between two points // Linear interpolation between two points
inline float lerp(float s, float e, float t) { inline float clip_lerp(float s, float e, float t) {
return s + (e - s) * t; return s + (e - s) * t;
} }
// Bilinear resize function // Bilinear resize function
@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
float y_lerp = py - y_floor; float y_lerp = py - y_floor;
for (int c = 0; c < 3; c++) { for (int c = 0; c < 3; c++) {
float top = lerp( float top = clip_lerp(
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
float bottom = lerp( float bottom = clip_lerp(
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp)); dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
} }
} }
} }
@ -1506,7 +1555,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
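
select_best_resolution keeps the grid candidate that preserves the most pixels and, on ties, wastes the fewest. A hedged worked example of the scoring above for a hypothetical 800 x 600 input and a 672 x 672 candidate (the numbers are illustrative, not from the commit):

// scale                = min(672/800, 672/600) = 0.84
// downscaled           = 672 x 504
// effective_resolution = min(672 * 504, 800 * 600) = 338688
// wasted_resolution    = 672 * 672 - 338688        = 112896
static_assert(672 * 504 == 338688 && 672 * 672 - 338688 == 112896, "worked example");
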
@ -1545,7 +1594,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
bool pad_to_square = true; bool pad_to_square = true;
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
@ -1622,7 +1671,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
} }
for (size_t i = 0; i < patches.size(); i++) { for (size_t i = 0; i < patches.size(); i++) {
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]); clip_image_u8_free(patches[i]);
} }
@ -1765,7 +1814,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -1777,7 +1826,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -1939,7 +1988,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
new_type = type; new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
} }
const size_t n_elms = ggml_nelements(cur); const size_t n_elms = ggml_nelements(cur);
float * f32_data; float * f32_data;
@ -1958,7 +2007,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
f32_data = (float *)conv_buf.data(); f32_data = (float *)conv_buf.data();
break; break;
default: default:
printf("Please use an input file in f32 or f16\n"); LOG_TEE("Please use an input file in f32 or f16\n");
gguf_free(ctx_out); gguf_free(ctx_out);
return false; return false;
} }
@ -1985,7 +2034,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
fout.put(0); fout.put(0);
} }
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
} }
@ -2001,8 +2050,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
gguf_free(ctx_out); gguf_free(ctx_out);
{ {
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
} }
return true; return true;

View File

@ -1,4 +1,5 @@
#include "ggml.h" #include "ggml.h"
#include "log.h"
#include "common.h" #include "common.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -45,7 +46,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true); llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret; static std::string ret;
if (id == llama_token_eos(llama_get_model(ctx_llama))) { if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = llama_token_to_piece(ctx_llama, id); ret = llama_token_to_piece(ctx_llama, id);
@ -73,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
size_t img_base64_str_start, img_base64_str_end; size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL; return NULL;
} }
@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) { if (!embed) {
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__); LOG_TEE("%s: could not load image from base64 string.\n", __func__);
return NULL; return NULL;
} }
@ -112,8 +113,8 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
@ -123,18 +124,18 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
auto prompt = params->prompt; auto prompt = params->prompt;
if (prompt_contains_image(prompt)) { if (prompt_contains_image(prompt)) {
if (!params->image.empty()) { if (!params->image.empty()) {
fprintf(stderr, "using base64 encoded image instead of command line image path\n"); LOG_TEE("using base64 encoded image instead of command line image path\n");
} }
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
if (!embed) { if (!embed) {
fprintf(stderr, "%s: can't load image from prompt\n", __func__); LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL; return NULL;
} }
params->prompt = remove_image_from_prompt(prompt); params->prompt = remove_image_from_prompt(prompt);
} else { } else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str()); embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
if (!embed) { if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str()); LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
return NULL; return NULL;
} }
} }
@ -153,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
printf("system_prompt: %s\n", system_prompt.c_str()); LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
printf("user_prompt: %s\n", user_prompt.c_str()); LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} else { } else {
@ -174,7 +175,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} }
@ -185,7 +186,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// generate the response // generate the response
fprintf(stderr, "\n"); LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
std::string response = ""; std::string response = "";
@ -224,7 +225,7 @@ static struct llava_context * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL; return NULL;
} }
@ -234,7 +235,7 @@ static struct llava_context * llava_init(gpt_params * params) {
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
return NULL; return NULL;
} }
@ -257,6 +258,12 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free(); llama_backend_free();
} }
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
@ -266,6 +273,14 @@ int main(int argc, char ** argv) {
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params); gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv); show_additional_info(argc, argv);
@ -274,7 +289,7 @@ int main(int argc, char ** argv) {
auto ctx_llava = llava_init(&params); auto ctx_llava = llava_init(&params);
if (ctx_llava == NULL) { if (ctx_llava == NULL) {
fprintf(stderr, "%s: error: failed to init llava\n", __func__); LOG_TEE("%s: error: failed to init llava\n", __func__);
return 1; return 1;
} }

View File

@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) { if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) { if (newline_tmp->buffer == NULL) {
printf("newline_tmp tensor buffer is NULL\n"); LOG_TEE("newline_tmp tensor buffer is NULL\n");
} }
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else { } else {
model.newline->data = newline_tmp->data; model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) { if (model.newline->data == NULL) {
printf("newline_tmp tensor data is NULL\n"); LOG_TEE("newline_tmp tensor data is NULL\n");
} }
} }
@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0; img_res_v.size = 0;
img_res_v.data = nullptr; img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__); LOG_TEE("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data; delete[] img_res_v.data;
return false; return false;
} }
@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data; delete[] img_res_v.data;
if (!encoded) { if (!encoded) {
fprintf(stderr, "Unable to encode image\n"); LOG_TEE("Unable to encode image\n");
return false; return false;
} }
@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) { if (!encoded) {
fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
} }
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip); const int32_t * image_grid = clip_image_grid(ctx_clip);
@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp"); // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
} }
printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us(); const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true; return true;
} }
@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip); auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) { if (n_image_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false; return false;
} }
return true; return true;
@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
if (!image_embd) { if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n"); LOG_TEE("Unable to allocate memory for image embeddings\n");
return false; return false;
} }
int n_img_pos; int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
fprintf(stderr, "%s: cannot encode image, aborting\n", __func__); LOG_TEE("%s: cannot encode image, aborting\n", __func__);
free(image_embd); free(image_embd);
return false; return false;
} }
@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
} }
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) { if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init(); clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img); clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__); LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL; return NULL;
} }
@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) { if (!image_embed_result) {
clip_image_u8_free(img); clip_image_u8_free(img);
fprintf(stderr, "%s: coulnd't embed the image\n", __func__); LOG_TEE("%s: coulnd't embed the image\n", __func__);
return NULL; return NULL;
} }
@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb"); auto file = fopen(path, "rb");
if (file == NULL) { if (file == NULL) {
fprintf(stderr, "%s: can't read file %s\n", __func__, path); LOG_TEE("%s: can't read file %s\n", __func__, path);
return false; return false;
} }
@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) { if (buffer == NULL) {
fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error"); perror("Memory allocation error");
fclose(file); fclose(file);
return false; return false;
@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length; long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) { if (!loaded) {
fprintf(stderr, "%s: failed to load %s\n", __func__, image_path); LOG_TEE("%s: failed to load %s\n", __func__, image_path);
return NULL; return NULL;
} }


@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
        }
        fflush(stdout);
-        if (id == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, id)) {
            has_eos = true;
        }


@@ -30,7 +30,6 @@ int main(int argc, char ** argv){
    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
    // tokenize the prompt


@@ -38,7 +38,6 @@ int main(int argc, char ** argv){
    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
    // tokenize the prompt
@@ -141,7 +140,7 @@ int main(int argc, char ** argv){
            printf("%s", token_str.c_str());
        }
-        if (id == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, id)) {
            has_eos = true;
        }


@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
                return 1;
            }
            session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }
@@ -795,8 +794,8 @@ int main(int argc, char ** argv) {
            }
        }
-        // deal with end of text token in interactive mode
-        if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+        // deal with end of generation tokens in interactive mode
+        if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
            LOG("found EOS token\n");
            if (params.interactive) {
@@ -920,8 +919,8 @@ int main(int argc, char ** argv) {
            }
        }
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }


@@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
                //       client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
                if (client.n_decoded > 2 &&
-                    (id == llama_token_eos(model) ||
+                    (llama_token_is_eog(model, id) ||
                     (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                     client.response.find("User:") != std::string::npos ||
                     client.response.find('\n') != std::string::npos)) {


@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                LOG_TEE("\n");
                break;
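The recurring change in these example hunks is that stop checks no longer compare a sampled token against llama_token_eos() but ask llama_token_is_eog(), which also covers other end-of-generation tokens such as end-of-turn markers. A small sketch of the new-style stop condition (the helper name and the n_predict handling are mine, not from the diff):

// sketch: true once sampling should stop, using the end-of-generation check
// introduced above instead of an equality test against llama_token_eos()
static bool generation_finished(const llama_model * model, llama_token id, int n_decoded, int n_predict) {
    return llama_token_is_eog(model, id) || (n_predict > 0 && n_decoded >= n_predict);
}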


@@ -1,6 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -8,7 +8,6 @@
#include <unordered_map>
#include <fstream>
#include <cmath>
-#include <algorithm>
struct quant_option {
    std::string name;
@@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE      = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET   = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS  = "quantize.imatrix.chunks_count";
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
@@ -97,6 +100,7 @@ static void usage(const char * executable) {
    printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@ -112,7 +116,7 @@ static void usage(const char * executable) {
exit(1); exit(1);
} }
static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary); std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@ -159,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str()); printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
} }
} }
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
// latest imatrix version contains the dataset filename at the end of the file
int m_last_call = 0;
if (in.peek() != EOF) {
in.read((char *)&m_last_call, sizeof(m_last_call));
int dataset_len;
in.read((char *)&dataset_len, sizeof(dataset_len));
std::vector<char> dataset_as_vec(dataset_len);
in.read(dataset_as_vec.data(), dataset_len);
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
}
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
return m_last_call;
} }
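The extra reads above imply that newer imatrix files carry a small trailer after the per-tensor data; a sketch of that layout as load_imatrix() now consumes it (struct and field names are mine, not from the file format code itself):

// sketch of the trailer appended to newer imatrix files, matching the reads above
struct imatrix_trailer {
    int32_t n_chunks;     // read into m_last_call: number of chunks the imatrix was computed on
    int32_t dataset_len;  // length of the dataset filename that follows
    // char dataset[dataset_len];  // not NUL-terminated; read into a std::vector<char>
};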
-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+        std::string & imatrix_dataset,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
    if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
    }
    if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
    }
    if (!excluded_weights.empty()) {
        for (auto& name : excluded_weights) {
@@ -196,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
    if (!imatrix_data.empty()) {
        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
    }
+    return m_last_call;
}
static ggml_type parse_ggml_type(const char * arg) {
@@ -210,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
    return result;
}
-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -300,6 +283,8 @@ int main(int argc, char ** argv) {
        } else {
            usage(argv[0]);
        }
+    } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+        params.keep_split = true;
    } else {
        usage(argv[0]);
    }
@@ -313,10 +298,43 @@ int main(int argc, char ** argv) {
        usage(argv[0]);
    }
+    std::string imatrix_dataset;
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
    }
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
@@ -332,20 +350,28 @@ int main(int argc, char ** argv) {
    std::string fname_out;
    std::string ftype_str;
+    std::string suffix = ".gguf";
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
        arg_idx++;
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
    } else {
        fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
        arg_idx++;
        if (argc <= arg_idx) {
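A short sketch that mirrors the output-naming logic above (it is not code from quantize.cpp): without --keep-split the .gguf suffix is appended immediately, while with --keep-split the name is left bare so the writer can add the shard suffix (for example -00001-of-00006.gguf) per split, as exercised by the test script further down.

#include <string>

// sketch (assumption: mirrors the branch above, not part of quantize.cpp)
static std::string make_output_name(const std::string & fpath, const std::string & ftype_str, bool keep_split) {
    std::string fname_out = fpath + "ggml-model-" + ftype_str;
    if (!keep_split) {
        fname_out += ".gguf";  // single merged output file
    }
    return fname_out;          // shard suffix is appended per split when keeping splits
}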


@@ -0,0 +1,65 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
MAIN=$1/main
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split model
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 3. Requant model with '--keep-split'
$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
echo PASS
echo
# 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Requant model without '--keep-split'
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
echo PASS
echo
# 4a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf


@@ -1,12 +1,29 @@
set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-add_executable(${TARGET}
+set(TARGET_SRCS
    server.cpp
    utils.hpp
    httplib.h
)
+set(PUBLIC_ASSETS
+    index.html
+    index.js
+    completion.js
+    json-schema-to-grammar.mjs
+)
+foreach(asset ${PUBLIC_ASSETS})
+    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    list(APPEND TARGET_SRCS ${output})
+    add_custom_command(
+        DEPENDS "${input}"
+        OUTPUT "${output}"
+        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+    )
+endforeach()
+add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
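The public assets are no longer committed as pre-generated headers; scripts/xxd.cmake now emits them into the build tree at build time. A rough sketch of what a generated header and its use look like (byte values and the length are hypothetical; the symbol naming follows the old xxd -i convention):

// index.html.hpp (generated into ${CMAKE_CURRENT_BINARY_DIR})
unsigned char index_html[] = { 0x3c, 0x21, 0x64, 0x6f, 0x63, /* ... */ };
unsigned int  index_html_len = 12345;  // hypothetical length

// server.cpp then serves the embedded bytes, roughly:
//   #include "index.html.hpp"
//   res.set_content(reinterpret_cast<const char *>(index_html), index_html_len, "text/html; charset=utf-8");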


@@ -90,7 +90,8 @@ export default function () {
        "model": model,
        "stream": true,
        "seed": 42,
-        "max_tokens": max_tokens
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed), since the server expects the model to always emit BOS
    }
    const params = {method: 'POST', body: JSON.stringify(payload)};


@@ -1,496 +0,0 @@
-unsigned char completion_js[] = {
-    0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
-    [... roughly 490 further lines of xxd-generated hex bytes omitted: this is the checked-in header for the server's completion.js asset, removed now that the asset headers are generated at build time via scripts/xxd.cmake ...]
-};
-unsigned int completion_js_len = 5909;


@@ -8,13 +8,3 @@ PUBLIC=$DIR/public
echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
echo >> $PUBLIC/index.js # add newline
-FILES=$(ls $PUBLIC)
-cd $PUBLIC
-for FILE in $FILES; do
-  echo "generate $FILE.hpp"
-  # use simple flag for old version of xxd
-  xxd -i $FILE > $DIR/$FILE.hpp
-done

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -881,11 +881,11 @@
.replace(/&/g, '&amp;') .replace(/&/g, '&amp;')
.replace(/</g, '&lt;') .replace(/</g, '&lt;')
.replace(/>/g, '&gt;') .replace(/>/g, '&gt;')
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>') .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>') .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
.replace(/__(.*?)__/g, '<strong>$1</strong>') .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
.replace(/\*(.*?)\*/g, '<em>$1</em>') .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
.replace(/_(.*?)_/g, '<em>$1</em>') .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>') .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
.replace(/`(.*?)`/g, '<code>$1</code>') .replace(/`(.*?)`/g, '<code>$1</code>')
.replace(/\n/gim, '<br />'); .replace(/\n/gim, '<br />');
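The new lookahead `(?=([^`]*`[^`]*`)*[^`]*$)` only lets a markdown substitution fire when an even number of backticks remains between the match and the end of the string, i.e. when the match is not inside an inline code span. A minimal stand-alone sketch of the same idea, written here in C++ with std::regex rather than the JavaScript engine the web UI actually uses:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Convert **bold** to <strong> only when the match is outside inline code:
    // the lookahead requires an even number of backticks ahead of the match.
    const std::regex bold(R"(\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$))");
    const std::string s = "**bold** and `code with **not bold**`";
    std::cout << std::regex_replace(s, bold, "<strong>$1</strong>") << "\n";
    // prints: <strong>bold</strong> and `code with **not bold**`
}
```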

View File

@ -854,7 +854,7 @@ struct server_context {
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
slot.params.seed = json_value(data, "seed", default_params.seed); slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
@ -1028,7 +1028,6 @@ struct server_context {
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
llama_set_rng_seed(ctx, slot.params.seed);
} }
slot.command = SLOT_COMMAND_LOAD_PROMPT; slot.command = SLOT_COMMAND_LOAD_PROMPT;
@ -1118,7 +1117,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok); const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
@ -1201,13 +1200,34 @@ struct server_context {
}); });
} }
if (result.tok == llama_token_eos(model)) { if (llama_token_is_eog(model, result.tok)) {
slot.stopped_eos = true; slot.stopped_eos = true;
slot.has_next_token = false; slot.has_next_token = false;
LOG_VERBOSE("eos token found", {}); LOG_VERBOSE("eos token found", {});
} }
auto n_ctx_train = llama_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
&& slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
LOG_WARNING("n_predict is not set and self-context extend is disabled."
" Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
{ "id_slot", slot.id },
{ "params.n_predict", slot.params.n_predict },
{ "slot.n_prompt_tokens", slot.n_prompt_tokens },
{ "slot.n_decoded", slot.n_decoded },
{ "slot.n_predict", slot.n_predict },
{ "n_slots", params.n_parallel },
{ "slot.n_ctx", slot.n_ctx },
{ "n_ctx", n_ctx },
{ "n_ctx_train", n_ctx_train },
{ "ga_n", slot.ga_n },
});
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false; // stop prediction
}
LOG_VERBOSE("next token", { LOG_VERBOSE("next token", {
{"id_slot", slot.id}, {"id_slot", slot.id},
{"id_task", slot.id_task}, {"id_task", slot.id_task},
@ -2142,7 +2162,7 @@ struct server_context {
}); });
// process the created batch of tokens // process the created batch of tokens
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots) { for (auto & slot : slots) {
@ -2372,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
printf(" --chat-template JINJA_TEMPLATE\n"); printf(" --chat-template JINJA_TEMPLATE\n");
@ -2803,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
char * sep = strchr(argv[i], '='); if (!parse_kv_override(argv[i], params.kv_overrides)) {
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true; invalid_param = true;
break; break;
} }
params.kv_overrides.push_back(kvo);
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams); server_print_usage(argv[0], default_params, default_sparams);
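The hand-rolled `KEY=TYPE:VALUE` parsing removed above is replaced by the shared `parse_kv_override()` helper. As a reminder of the grammar, here is a self-contained sketch of the same parsing; the struct and function names below are made up for the illustration and the real helper may differ in detail:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

// Hypothetical stand-alone parser for "KEY=TYPE:VALUE" override strings.
struct kv_override {
    std::string key;
    enum { INT, FLOAT, BOOL } tag = INT;
    long   int_value   = 0;
    double float_value = 0.0;
    bool   bool_value  = false;
};

static bool parse_override(const char * arg, kv_override & out) {
    const char * sep = strchr(arg, '=');
    if (sep == nullptr) return false;
    out.key.assign(arg, sep - arg);
    ++sep;
    if      (strncmp(sep, "int:",   4) == 0) { out.tag = kv_override::INT;   out.int_value   = atol(sep + 4); }
    else if (strncmp(sep, "float:", 6) == 0) { out.tag = kv_override::FLOAT; out.float_value = atof(sep + 6); }
    else if (strncmp(sep, "bool:",  5) == 0) { out.tag = kv_override::BOOL;  out.bool_value  = strcmp(sep + 5, "true") == 0; }
    else return false;
    return true;
}

int main() {
    kv_override kvo;
    if (parse_override("tokenizer.ggml.add_bos_token=bool:false", kvo)) {
        printf("key=%s bool=%d\n", kvo.key.c_str(), kvo.bool_value);
    }
}
```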

View File

@ -29,7 +29,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
cd ../../.. cd ../../..
mkdir build mkdir build
cd build cd build
cmake ../ cmake -DLLAMA_CURL=ON ../
cmake --build . --target server cmake --build . --target server
``` ```

View File

@ -0,0 +1,57 @@
@llama.cpp
@results
Feature: Results
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 128 as batch size
And 256 KV cache size
And 128 max tokens to predict
Scenario Outline: Multi users completion
Given <n_slots> slots
And continuous batching
Then the server is starting
Then the server is healthy
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_slots |
| 1 |
| 2 |

View File

@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_metrics = False context.server_metrics = False
context.server_process = None context.server_process = None
context.seed = None context.seed = None
context.draft = None
context.server_seed = None context.server_seed = None
context.user_api_key = None context.user_api_key = None
context.response_format = None context.response_format = None
@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl context.n_gpu_layer = ngl
@step('{draft:d} as draft')
def step_draft(context, draft):
context.draft = draft
@step('{n_ctx:d} KV cache size') @step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx): def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx context.n_ctx = n_ctx
@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.completion, predicted_n) assert_n_tokens_predicted(context.completion, predicted_n)
@step('all predictions are equal')
@async_run_until_complete
async def step_predictions_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_equal(context.tasks_result)
context.tasks_result = []
@step('the completion is truncated') @step('the completion is truncated')
def step_assert_completion_truncated(context): def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '') step_assert_completion_truncated(context, '')
@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}') f' {n_predicted} <> {expected_predicted_n}')
def assert_all_predictions_equal(completion_responses):
content_0 = completion_responses[0]['content']
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content 0: {content_0}")
i = 1
for response in completion_responses[1:]:
content = response['content']
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content {i}: {content}")
assert content == content_0, "contents not equal"
i += 1
async def gather_tasks_results(context): async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks) n_tasks = len(context.concurrent_tasks)
@ -1148,6 +1180,8 @@ def start_server_background(context):
server_args.extend(['--ubatch-size', context.n_ubatch]) server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_gpu_layer: if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None:
server_args.extend(['--draft', context.draft])
if context.server_continuous_batching: if context.server_continuous_batching:
server_args.append('--cont-batching') server_args.append('--cont-batching')
if context.server_embeddings: if context.server_embeddings:

View File

@ -4,9 +4,8 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
# Start @llama.cpp scenario # Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else else
behave "$@" behave "$@"
fi fi

View File

@ -381,10 +381,6 @@ static json oaicompat_completion_params_parse(
} else { } else {
llama_params["stop"] = json_value(body, "stop", json::array()); llama_params["stop"] = json_value(body, "stop", json::array());
} }
// Some chat templates don't use EOS token to stop generation
// We must add their end sequences to list of stop words
llama_params["stop"].push_back("<|im_end|>"); // chatml
llama_params["stop"].push_back("<end_of_turn>"); // gemma
// Handle "response_format" field // Handle "response_format" field
if (body.contains("response_format")) { if (body.contains("response_format")) {

View File

@ -133,8 +133,8 @@ int main(int argc, char ** argv) {
// sample the most likely token // sample the most likely token
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? // is it an end of generation?
if (new_token_id == llama_token_eos(model) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n"); LOG_TEE("\n");
break; break;
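`llama_token_is_eog()` replaces the comparison against the single EOS id because chat-tuned models can finish a turn with other special tokens (e.g. `<|im_end|>`, `<end_of_turn>`). A self-contained stand-in for the idea, with made-up token ids rather than the real llama.cpp API:

```cpp
#include <cstdio>
#include <set>

// "End of generation" is a set of special tokens, not a single EOS id.
static bool token_is_eog(const std::set<int> & eog_ids, int token) {
    return eog_ids.count(token) > 0;
}

int main() {
    const std::set<int> eog_ids = {2, 32000, 32001}; // hypothetical ids
    const int stream[] = {15, 88, 32000, 7};
    for (int tok : stream) {
        if (token_is_eog(eog_ids, tok)) {
            printf("stop: end of generation token %d\n", tok);
            break;
        }
        printf("emit %d\n", tok);
    }
}
```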

View File

@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
} }
} }
if (token_id == llama_token_eos(model_tgt)) { if (llama_token_is_eog(model_tgt, token_id)) {
has_eos = true; has_eos = true;
} }
++n_predict; ++n_predict;

View File

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1712791164, "lastModified": 1714076141,
"narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=", "narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5", "rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@ -371,16 +371,16 @@ struct ggml_gallocr {
}; };
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL); GGML_ASSERT(galloc != NULL);
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL); GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL); GGML_ASSERT(galloc->buf_tallocs != NULL);
for (int i = 0; i < n_bufs; i++) { for (int i = 0; i < n_bufs; i++) {
@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
free(galloc->hash_set.keys); free(galloc->hash_set.keys);
free(galloc->hash_values); free(galloc->hash_values);
galloc->hash_set.size = hash_size; galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
GGML_ASSERT(galloc->hash_set.keys != NULL); GGML_ASSERT(galloc->hash_set.keys != NULL);
GGML_ASSERT(galloc->hash_values != NULL); GGML_ASSERT(galloc->hash_values != NULL);
} else { } else {
@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// set the node_allocs from the hash table // set the node_allocs from the hash table
if (galloc->n_nodes < graph->n_nodes) { if (galloc->n_nodes < graph->n_nodes) {
free(galloc->node_allocs); free(galloc->node_allocs);
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
GGML_ASSERT(galloc->node_allocs != NULL); GGML_ASSERT(galloc->node_allocs != NULL);
} }
galloc->n_nodes = graph->n_nodes; galloc->n_nodes = graph->n_nodes;
@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
if (galloc->n_leafs < graph->n_leafs) { if (galloc->n_leafs < graph->n_leafs) {
free(galloc->leaf_allocs); free(galloc->leaf_allocs);
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs); galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
GGML_ASSERT(galloc->leaf_allocs != NULL); GGML_ASSERT(galloc->leaf_allocs != NULL);
} }
galloc->n_leafs = graph->n_leafs; galloc->n_leafs = graph->n_leafs;
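All of these calls are rewritten to the standard `calloc(nmemb, size)` form instead of `calloc(size * n, 1)`. Presumably the point is that keeping the element count and element size separate lets the allocator (and static analyzers) detect `nmemb * size` overflow that a pre-multiplied single argument hides; a tiny illustration:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // calloc(nmemb, size): count and element size stay separate, so the
    // allocator can reject an nmemb*size product that would overflow.
    const size_t n = 4;
    int * a = (int *) calloc(n, sizeof(int));      // preferred form
    int * b = (int *) calloc(sizeof(int) * n, 1);  // same result here, but no overflow check on the product
    printf("a=%p b=%p\n", (void *) a, (void *) b);
    free(a);
    free(b);
}
```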

View File

@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
switch (op->op) { switch (op->op) {
case GGML_OP_CPY: case GGML_OP_CPY:
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float return
op->type != GGML_TYPE_IQ2_XXS &&
op->type != GGML_TYPE_IQ2_XS &&
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
default: default:
@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
// initialize hash table // initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size); sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->n_backends = n_backends; sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
const int initial_splits_capacity = 16; const int initial_splits_capacity = 16;
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
sched->splits_capacity = initial_splits_capacity; sched->splits_capacity = initial_splits_capacity;
for (int b = 0; b < n_backends; b++) { for (int b = 0; b < n_backends; b++) {
@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
void ggml_backend_sched_reset(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run // reset state for the next run
size_t hash_size = sched->hash_set.size; if (!sched->is_reset) {
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT size_t hash_size = sched->hash_set.size;
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
sched->is_reset = true; sched->is_reset = true;
}
sched->is_alloc = false; sched->is_alloc = false;
} }
@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = { struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size, /* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
}; };
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
struct ggml_init_params params = { struct ggml_init_params params = {
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
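Earlier in this file, `ggml_backend_sched_reset()` now skips its memsets when the scheduler is already in the reset state, tracked by the `is_reset` flag. A simplified sketch of that "reset only when dirty" pattern, using stand-in types rather than the real ggml structs:

```cpp
#include <cstring>
#include <vector>

// Clearing large hash tables on every reset is wasted work if nothing ran
// since the last reset, so a flag short-circuits the memset.
struct sched_like {
    std::vector<int> tensor_backend_id;
    bool is_reset = true;

    void use() { is_reset = false; }   // called when a graph is scheduled

    void reset() {
        if (!is_reset) {
            std::memset(tensor_backend_id.data(), -1,
                        tensor_backend_id.size() * sizeof(int));
            is_reset = true;
        }
    }
};

int main() {
    sched_like s;
    s.tensor_backend_id.assign(1024, 7);
    s.use();
    s.reset();   // performs the memset
    s.reset();   // no-op: already reset
}
```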

View File

@ -5,16 +5,16 @@
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x); const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
if (i >= k) { if (i >= k) {
return; return;
} }
const int64_t ib = i/qk; // block index const int64_t ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index const int64_t iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index const int64_t iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2; const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize // dequantize
dfloat2 v; dfloat2 v;
@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
#if __CUDA_ARCH__ >= CC_PASCAL #if __CUDA_ARCH__ >= CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
const int * x0 = ((int *) vx) + blockIdx.x * nint; const int * x0 = ((int *) vx) + blockIdx.x * nint;
half2 * y2 = (half2 *) (y + i0); half2 * y2 = (half2 *) (y + i0);
@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x; const int64_t i = blockIdx.x;
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int64_t ib = 8*i + ir; const int64_t ib = 8*i + ir;
if (ib >= nb32) { if (ib >= nb32) {
return; return;
@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x; const int64_t i = blockIdx.x;
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int64_t ib = 8*i + ir; const int64_t ib = 8*i + ir;
if (ib >= nb32) { if (ib >= nb32) {
return; return;
@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx; const block_q2_K * x = (const block_q2_K *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int n = tid/32; const int64_t n = tid/32;
const int l = tid - 32*n; const int64_t l = tid - 32*n;
const int is = 8*n + l/16; const int64_t is = 8*n + l/16;
const uint8_t q = x[i].qs[32*n + l]; const uint8_t q = x[i].qs[32*n + l];
dst_t * y = yy + i*QK_K + 128*n; dst_t * y = yy + i*QK_K + 128*n;
@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else #else
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const int il = tid%16; // 0...15 const int64_t il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is); const uint8_t q = x[i].qs[il] >> (2*is);
dst_t * y = yy + i*QK_K + 16*is + il; dst_t * y = yy + i*QK_K + 16*is + il;
float dall = __low2half(x[i].dm); float dall = __low2half(x[i].dm);
@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx; const block_q3_K * x = (const block_q3_K *) vx;
#if QK_K == 256 #if QK_K == 256
const int r = threadIdx.x/4; const int64_t r = threadIdx.x/4;
const int tid = r/2; const int64_t tid = r/2;
const int is0 = r%2; const int64_t is0 = r%2;
const int l0 = 16*is0 + 4*(threadIdx.x%4); const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
const int n = tid / 4; const int64_t n = tid / 4;
const int j = tid - 4*n; const int64_t j = tid - 4*n;
uint8_t m = 1 << (4*n + j); uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0; int64_t is = 8*n + 2*j + is0;
int shift = 2*j; int shift = 2*j;
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const int il = tid%16; // 0...15 const int64_t il = tid%16; // 0...15
const int im = il/8; // 0...1 const int64_t im = il/8; // 0...1
const int in = il%8; // 0...7 const int64_t in = il%8; // 0...7
dst_t * y = yy + i*QK_K + 16*is + il; dst_t * y = yy + i*QK_K + 16*is + il;
@ -227,15 +227,15 @@ template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx; const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x; const int64_t i = blockIdx.x;
#if QK_K == 256 #if QK_K == 256
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int is = 2*il; const int64_t is = 2*il;
const int n = 4; const int64_t n = 4;
dst_t * y = yy + i*QK_K + 64*il + n*ir; dst_t * y = yy + i*QK_K + 64*il + n*ir;
@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
y[l +32] = d2 * (q[l] >> 4) - m2; y[l +32] = d2 * (q[l] >> 4) - m2;
} }
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const uint8_t * q = x[i].qs; const uint8_t * q = x[i].qs;
dst_t * y = yy + i*QK_K; dst_t * y = yy + i*QK_K;
const float d = (float)x[i].dm[0]; const float d = (float)x[i].dm[0];
@ -268,14 +268,14 @@ template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx; const block_q5_K * x = (const block_q5_K *) vx;
const int i = blockIdx.x; const int64_t i = blockIdx.x;
#if QK_K == 256 #if QK_K == 256
// assume 64 threads - this is very slightly better than the one below // assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/16; // il is in 0...3 const int64_t il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15 const int64_t ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6 const int64_t is = 2*il; // is is in 0...6
dst_t * y = yy + i*QK_K + 64*il + 2*ir; dst_t * y = yy + i*QK_K + 64*il + 2*ir;
@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const uint8_t q = x[i].qs[tid]; const uint8_t q = x[i].qs[tid];
const int im = tid/8; // 0...3 const int64_t im = tid/8; // 0...3
const int in = tid%8; // 0...7 const int64_t in = tid%8; // 0...7
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const uint8_t h = x[i].qh[in] >> im; const uint8_t h = x[i].qh[in] >> im;
const float d = x[i].d; const float d = x[i].d;
dst_t * y = yy + i*QK_K + tid; dst_t * y = yy + i*QK_K + tid;
@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_xxs * x = (const block_iq2_xxs *) vx; const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib; const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * aux8 = (const uint8_t *)q2; const uint8_t * aux8 = (const uint8_t *)q2;
@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_xs * x = (const block_iq2_xs *) vx; const block_iq2_xs * x = (const block_iq2_xs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib; const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_s * x = (const block_iq2_s *) vx; const block_iq2_s * x = (const block_iq2_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq3_xxs * x = (const block_iq3_xxs *) vx; const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * q3 = x[i].qs + 8*ib; const uint8_t * q3 = x[i].qs + 8*ib;
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq3_s * x = (const block_iq3_s *) vx; const block_iq3_s * x = (const block_iq3_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib; const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq1_s * x = (const block_iq1_s *) vx; const block_iq1_s * x = (const block_iq1_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq1_m * x = (const block_iq1_m *) vx; const block_iq1_m * x = (const block_iq1_m *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * sc = (const uint16_t *)x[i].scales; const uint16_t * sc = (const uint16_t *)x[i].scales;
iq1m_scale_t scale; iq1m_scale_t scale;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il; dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[ib].qs + 4*il; const uint8_t * q4 = x[ib].qs + 4*il;
const float d = (float)x[ib].d; const float d = (float)x[ib].d;
@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
#if QK_K != 64 #if QK_K != 64
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq4_xs * x = (const block_iq4_xs *)vx; const block_iq4_xs * x = (const block_iq4_xs *)vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il; dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il; const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
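Throughout these dequantize kernels the thread and block indices are promoted from `int` to `int64_t`, so element offsets stay correct for tensors with more than about 2^31 elements. A host-side illustration with made-up grid dimensions:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // The per-thread element index 2*(blockDim.x*blockIdx.x + tid) can exceed
    // INT_MAX for very large tensors; computing it in 64 bits avoids the wrap.
    const int64_t block_dim = 256;
    const int64_t block_idx = 5000000;   // hypothetical large grid
    const int64_t tid       = 17;

    const int64_t i64 = 2 * (block_dim * block_idx + tid);   // correct
    const int     i32 = (int) i64;                            // truncated: wrong index

    printf("64-bit index: %lld\n32-bit index: %d\n", (long long) i64, i32);
}
```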

View File

@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
extern __shared__ float data_soft_max_f32[]; extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
// shared memory buffer to cache values between iterations: // shared memory buffer to cache values between iterations:
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols; float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
float max_val = -INFINITY; float max_val = -INFINITY;
@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
break; break;
} }
const int ix = rowx*ncols + col; const int64_t ix = (int64_t)rowx*ncols + col;
const int iy = rowy*ncols + col; const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f); const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
return; return;
} }
const int idst = rowx*ncols + col; const int64_t idst = (int64_t)rowx*ncols + col;
dst[idst] = vals[col] * inv_sum; dst[idst] = vals[col] * inv_sum;
} }
} }
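The soft-max kernel gets the same treatment: the flattened offset `rowx*ncols + col` is now computed as `(int64_t)rowx*ncols + col`, casting the first operand so the multiplication itself happens in 64 bits rather than overflowing in 32-bit arithmetic before the widening assignment.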

View File

@ -11,6 +11,12 @@
#include <string.h> // memcpy #include <string.h> // memcpy
#include <math.h> // fabsf #include <math.h> // fabsf
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -45,7 +51,7 @@ extern "C" {
// 16-bit float // 16-bit float
// on Arm, we use __fp16 // on Arm, we use __fp16
// on x86, we use uint16_t // on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER) #if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
// //
@ -53,8 +59,262 @@ extern "C" {
// //
#include <arm_neon.h> #include <arm_neon.h>
#ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
typedef __fp16 ggml_fp16_internal_t; typedef __fp16 ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER
#if !defined(__aarch64__)
// 32-bit ARM compatibility
// vaddvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
return vcombine_s32(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly
typedef struct ggml_int16x8x2_t {
int16x8_t val[2];
} ggml_int16x8x2_t;
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
ggml_int16x8x2_t res;
res.val[0] = vld1q_s16(ptr + 0);
res.val[1] = vld1q_s16(ptr + 8);
return res;
}
typedef struct ggml_uint8x16x2_t {
uint8x16_t val[2];
} ggml_uint8x16x2_t;
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
ggml_uint8x16x2_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
return res;
}
typedef struct ggml_uint8x16x4_t {
uint8x16_t val[4];
} ggml_uint8x16x4_t;
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
ggml_uint8x16x4_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
res.val[2] = vld1q_u8(ptr + 32);
res.val[3] = vld1q_u8(ptr + 48);
return res;
}
typedef struct ggml_int8x16x2_t {
int8x16_t val[2];
} ggml_int8x16x2_t;
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
ggml_int8x16x2_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
return res;
}
typedef struct ggml_int8x16x4_t {
int8x16_t val[4];
} ggml_int8x16x4_t;
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
ggml_int8x16x4_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
res.val[2] = vld1q_s8(ptr + 32);
res.val[3] = vld1q_s8(ptr + 48);
return res;
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
int8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
#else
#define ggml_int16x8x2_t int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t int8x16x2_t
#define ggml_int8x16x4_t int8x16x4_t
#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2 vld1q_u8_x2
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
#define ggml_vqtbl1q_u8 vqtbl1q_u8
#endif // !defined(__aarch64__)
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif // !defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_NEON)
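The `ggml_vdotq_s32()` fallback above emulates the ARM dot-product instruction with `vmull_s8` plus pairwise adds when `__ARM_FEATURE_DOTPROD` is unavailable. Below is a scalar reference of the per-lane semantics of the real instruction; the NEON fallback accumulates the same sixteen products, merely grouped differently across lanes, which is equivalent once the four lanes are reduced:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Each 32-bit lane accumulates the sum of the corresponding four int8
    // products. Input values are arbitrary.
    int8_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = (int8_t)(i - 8); b[i] = (int8_t)(3 - i); }

    int32_t acc[4] = {0, 0, 0, 0};
    for (int lane = 0; lane < 4; ++lane) {
        for (int k = 0; k < 4; ++k) {
            acc[lane] += (int32_t) a[4*lane + k] * (int32_t) b[4*lane + k];
        }
    }
    printf("%d %d %d %d\n", acc[0], acc[1], acc[2], acc[3]);
}
```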
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#else #else
typedef uint16_t ggml_fp16_internal_t;
#ifdef __wasm_simd128__ #ifdef __wasm_simd128__
#include <wasm_simd128.h> #include <wasm_simd128.h>
#else #else
@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#endif // __F16C__ #endif // __F16C__
#endif // __ARM_NEON #endif // defined(__ARM_NEON) && !defined(_MSC_VER)
// precomputed f32 table for f16 (256 KB) // precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init() // defined in ggml.c, initialized in ggml_init()

View File

@ -14,47 +14,6 @@
#include <stdlib.h> // for qsort #include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT #include <stdio.h> // for GGML_ASSERT
#ifdef __ARM_NEON
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
#endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // __AVX__ || __AVX2__ || __AVX512F__
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
#if defined(__ARM_NEON)
#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif
#if !defined(__aarch64__)
// 64-bit compatibility
// vaddvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
return vcombine_s32(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly
typedef struct ggml_int16x8x2_t {
int16x8_t val[2];
} ggml_int16x8x2_t;
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
ggml_int16x8x2_t res;
res.val[0] = vld1q_s16(ptr + 0);
res.val[1] = vld1q_s16(ptr + 8);
return res;
}
typedef struct ggml_uint8x16x2_t {
uint8x16_t val[2];
} ggml_uint8x16x2_t;
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
ggml_uint8x16x2_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
return res;
}
typedef struct ggml_uint8x16x4_t {
uint8x16_t val[4];
} ggml_uint8x16x4_t;
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
ggml_uint8x16x4_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
res.val[2] = vld1q_u8(ptr + 32);
res.val[3] = vld1q_u8(ptr + 48);
return res;
}
typedef struct ggml_int8x16x2_t {
int8x16_t val[2];
} ggml_int8x16x2_t;
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
ggml_int8x16x2_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
return res;
}
typedef struct ggml_int8x16x4_t {
int8x16_t val[4];
} ggml_int8x16x4_t;
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
ggml_int8x16x4_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
res.val[2] = vld1q_s8(ptr + 32);
res.val[3] = vld1q_s8(ptr + 48);
return res;
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
int8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
#else
#define ggml_int16x8x2_t int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t int8x16x2_t
#define ggml_int8x16x4_t int8x16x4_t
#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2 vld1q_u8_x2
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
#define ggml_vqtbl1q_u8 vqtbl1q_u8
#endif
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif
#endif
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
@@ -12676,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
block_iq2_s * restrict y = vy;
quantize_row_iq2_s_reference(x, y, k);
}
static bool validate_float(float f, size_t i) {
if (isinf(f)) {
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
return false;
}
if (isnan(f)) {
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
return false;
}
return true;
}
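// note: ggml_fp16_t is handled below as a raw IEEE 754 binary16 bit pattern (exponent mask 0x7c00, mantissa mask 0x03ff)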
static bool isinf_fp16(ggml_fp16_t f) {
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
}
static bool isnan_fp16(ggml_fp16_t f) {
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
}
static bool validate_fp16(ggml_fp16_t f, size_t i) {
if (isinf_fp16(f)) {
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
return false;
}
if (isnan_fp16(f)) {
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
return false;
}
return true;
}
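// helpers for the block-quantized types below: scan every block and validate its fp16 scale d (and min m for the DM variant)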
#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
if (!validate_fp16(q[i].d, i)) { \
return false; \
} \
}
#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
return false; \
} \
}
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
if (type < 0 || type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
return false;
}
if (nbytes % ggml_type_size(type) != 0) {
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
return false;
}
const size_t nb = nbytes/ggml_type_size(type);
switch (type) {
case GGML_TYPE_F16:
{
const ggml_fp16_t * f = (const ggml_fp16_t *) data;
size_t i = 0;
#if defined(__AVX2__)
for (; i + 15 < nb; i += 16) {
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
__m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
int mask = _mm256_movemask_epi8(cmp);
if (mask) {
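// at least one lane has an all-ones exponent (inf or nan), so the scalar re-check below must return false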
for (size_t j = 0; j < 16; ++j) {
if (!validate_fp16(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#elif defined(__ARM_NEON)
for (; i + 7 < nb; i += 8) {
uint16x8_t v = vld1q_u16(f + i);
uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
if (mask) {
for (size_t j = 0; j < 8; ++j) {
if (!validate_fp16(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#endif
for (; i < nb; ++i) {
if (!validate_fp16(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_F32:
{
const float * f = (const float *) data;
size_t i = 0;
#if defined(__AVX2__)
for (; i + 7 < nb; i += 8) {
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
__m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
int mask = _mm256_movemask_epi8(cmp);
if (mask) {
for (size_t j = 0; j < 8; ++j) {
if (!validate_float(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#elif defined(__ARM_NEON)
for (; i + 3 < nb; i += 4) {
uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
if (mask) {
for (size_t j = 0; j < 4; ++j) {
if (!validate_float(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#endif
for (; i < nb; ++i) {
if (!validate_float(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_F64:
{
const double * f = (const double *) data;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_Q4_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
} break;
case GGML_TYPE_Q4_1:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
} break;
case GGML_TYPE_Q5_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
} break;
case GGML_TYPE_Q5_1:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
} break;
case GGML_TYPE_Q8_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
} break;
case GGML_TYPE_Q2_K:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
} break;
case GGML_TYPE_Q3_K:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
} break;
case GGML_TYPE_Q4_K:
{
#ifdef GGML_QKK_64
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
#else
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
#endif
} break;
case GGML_TYPE_Q5_K:
{
#ifdef GGML_QKK_64
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
#else
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
#endif
} break;
case GGML_TYPE_Q6_K:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
} break;
case GGML_TYPE_Q8_K:
{
const block_q8_K * q = (const block_q8_K *) data;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(q[i].d, i)) {
return false;
}
}
} break;
case GGML_TYPE_IQ1_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
} break;
case GGML_TYPE_IQ1_M:
{
const block_iq1_m * q = (const block_iq1_m *) data;
for (size_t i = 0; i < nb; ++i) {
#if QK_K == 64
if (!validate_fp16(q[i].d, i)) {
return false;
}
#else
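// the shared fp16 scale is split across the top nibbles of the four 16-bit scale words; reassemble it before validating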
iq1m_scale_t scale;
const uint16_t * sc = (const uint16_t *)q[i].scales;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
if (!validate_fp16(scale.f16, i)) {
return false;
}
#endif
}
} break;
case GGML_TYPE_IQ2_XXS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
} break;
case GGML_TYPE_IQ2_XS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
} break;
case GGML_TYPE_IQ2_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
} break;
case GGML_TYPE_IQ3_XXS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
} break;
case GGML_TYPE_IQ3_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
} break;
case GGML_TYPE_IQ4_XS:
#if QK_K != 64
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
} break;
#endif
// with QK_K == 64, iq4_xs is iq4_nl
case GGML_TYPE_IQ4_NL:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break;
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
// nothing to validate
break;
default:
{
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
return false;
}
}
return true;
}
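A minimal sketch of how the validator might be driven by loading code; the helper below is illustrative, not part of the ggml API, and assumes the tensor's data is fully resident in host memory:

// illustrative only: validate a whole tensor's payload in one call
static bool tensor_data_is_valid(const struct ggml_tensor * t) {
    // ggml_nbytes() returns the total payload size; ggml_validate_row_data() walks it block-wise per type
    return ggml_validate_row_data(t->type, t->data, ggml_nbytes(t));
}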


@@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
version += std::to_string(prop.get_minor_version());
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
std::string name = std::string(prop.get_name());
name = std::regex_replace(name, std::regex("\\(R\\)"), "");
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
prop.get_global_mem_size());
auto global_mem_size = prop.get_global_mem_size()/1000000;
fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}
void ggml_backend_sycl_print_sycl_devices() {
@@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count);
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();

63
ggml.c
View File

@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
// simd mappings
//
#if defined(__ARM_NEON)
#if !defined(__aarch64__)
// 64-bit compatibility
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
#endif
#endif
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires to define the corresponding SIMD macros
@@ -10825,7 +10813,7 @@ static void ggml_compute_forward_mul_mat(
#endif
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type)) {
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10878,15 +10866,13 @@ UseGgmlGemm1:;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
if (src1->type != vec_dot_type) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + ggml_row_size(vec_dot_type,
nb12/ggml_type_size(src1->type)*i12 +
nb13/ggml_type_size(src1->type)*i13),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
@@ -20628,7 +20614,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
@@ -20673,7 +20659,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
// read the header
{
@@ -20710,9 +20696,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
const uint64_t n_kv = ctx->header.n_kv;
// header.n_kv will hold the actual value of pairs that were successfully read in the loop below
ctx->header.n_kv = 0;
ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
for (uint64_t i = 0; i < n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20761,7 +20751,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break;
@@ -20775,7 +20765,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20791,6 +20781,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
break;
}
ctx->header.n_kv++;
}
if (!ok) {
@@ -20803,7 +20795,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20824,8 +20816,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
// TODO: return an error instead of crashing with GGML_ASSERT
gguf_tensor_info_sanitize(info);
// make sure there are no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
fclose(file);
@@ -20994,7 +20995,7 @@ void gguf_free(struct gguf_context * ctx) {
GGML_FREE(ctx->infos);
}
GGML_ALIGNED_FREE(ctx);
GGML_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
@@ -21305,7 +21306,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
}
@@ -21315,7 +21316,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]);
@@ -21342,7 +21343,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
@@ -21362,6 +21363,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
if (gguf_find_tensor(ctx, tensor->name) != -1) {
GGML_ASSERT(false && "duplicated tensor name");
}
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
@@ -21430,7 +21435,7 @@ struct gguf_buf {
static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};

ggml.h

@@ -762,6 +762,8 @@ extern "C" {
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);


@@ -124,6 +124,7 @@ class MODEL_ARCH(IntEnum):
QWEN2 = auto()
QWEN2MOE = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
@@ -200,6 +201,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
@@ -550,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.PHI3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.CODESHELL: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
@@ -872,6 +888,7 @@ GGML_QUANT_SIZES = {
GGMLQuantizationType.I32: (1, 4),
GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
}


@@ -234,8 +234,14 @@ class GGUFReader:
def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
tensors = []
tensor_names = set() # keep track of name to prevent duplicated tensors
for field in fields:
_name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
# check if there's any tensor having same name already in the list
tensor_name = str(bytes(name_data), encoding = 'utf-8')
if tensor_name in tensor_names:
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
tensor_names.add(tensor_name)
ggml_type = GGMLQuantizationType(raw_dtype[0])
n_elems = np.prod(dims)
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
@@ -267,7 +273,7 @@ class GGUFReader:
item_count = n_bytes
item_type = np.uint8
tensors.append(ReaderTensor(
name = str(bytes(name_data), encoding = 'utf-8'),
name = tensor_name,
tensor_type = ggml_type,
shape = dims,
n_elements = n_elems,


@@ -63,6 +63,7 @@ class GGUFWriter:
self.kv_data_count = 0
self.ti_data = bytearray()
self.ti_data_count = 0
self.ti_names = set()
self.use_temp_file = use_temp_file
self.temp_file = None
self.tensors = []
@@ -197,6 +198,10 @@ class GGUFWriter:
if self.state is not WriterState.EMPTY:
raise ValueError(f'Expected output file to be empty, got {self.state}')
if name in self.ti_names:
raise ValueError(f'Duplicated tensor name {name}')
self.ti_names.add(name)
encoded_name = name.encode("utf8")
self.ti_data += self._pack("Q", len(encoded_name))
self.ti_data += encoded_name


@@ -117,6 +117,7 @@ class TensorNameMap:
"h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj" # phi3
),
# Attention query
@@ -234,6 +235,7 @@ class TensorNameMap:
"h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", # phi3
"model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert

llama.cpp

File diff suppressed because it is too large

llama.h

@@ -195,15 +195,19 @@ extern "C" {
LLAMA_KV_OVERRIDE_TYPE_INT,
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
LLAMA_KV_OVERRIDE_TYPE_BOOL,
LLAMA_KV_OVERRIDE_TYPE_STR,
};
struct llama_model_kv_override {
char key[128];
enum llama_model_kv_override_type tag;
char key[128];
union {
int64_t int_value;
double float_value;
bool bool_value;
int64_t val_i64;
double val_f64;
bool val_bool;
char val_str[128];
};
};
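A hedged sketch of how a caller might fill in the renamed union fields and the new string override; the helper below is illustrative and not part of the API, and the key is only an example:

// illustrative only: build a string override entry (assumes <stdio.h> and llama.h are included)
static struct llama_model_kv_override make_str_override(void) {
    struct llama_model_kv_override kvo = {0};
    kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    snprintf(kvo.key,     sizeof(kvo.key),     "%s", "tokenizer.chat_template"); // example key
    snprintf(kvo.val_str, sizeof(kvo.val_str), "%s", "...");                     // value elided
    return kvo;
}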
@@ -232,9 +236,10 @@ extern "C" {
const struct llama_model_kv_override * kv_overrides;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
};
struct llama_context_params {
@@ -288,6 +293,7 @@ extern "C" {
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
} llama_model_quantize_params;
@@ -390,8 +396,10 @@ extern "C" {
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -783,6 +791,9 @@ extern "C" {
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
@@ -796,7 +807,7 @@ extern "C" {
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
// codellama infill tokens
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@@ -825,11 +836,13 @@ extern "C" {
// Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer.
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model,
llama_token token,
char * buf,
int32_t length);
int32_t length,
bool special);
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
@@ -982,7 +995,7 @@ extern "C" {
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx,
llama_token_data_array * candidates);
@@ -1069,8 +1082,9 @@ extern "C" {
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <random>
#include <string>
#include <vector>
struct ggml_tensor;
@@ -1107,6 +1121,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H
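A brief, illustrative fragment combining llama_token_is_eog and the extended llama_token_to_piece; candidate preparation and the llama_decode calls that would drive a real loop are omitted:

// sketch only: stream sampled pieces until an end-of-generation token appears
static void print_until_eog(struct llama_context * ctx, const struct llama_model * model,
                            llama_token_data_array * candidates) {
    char piece[64];
    for (;;) {
        const llama_token tok = llama_sample_token(ctx, candidates);
        if (llama_token_is_eog(model, tok)) {
            break; // EOS, EOT, etc. all end generation
        }
        const int32_t n = llama_token_to_piece(model, tok, piece, (int32_t) sizeof(piece), /*special =*/ false);
        if (n > 0) {
            fwrite(piece, 1, (size_t) n, stdout);
        }
        // a real loop would feed tok back through llama_decode() and rebuild candidates here
    }
}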

media/matmul.png (new binary file, 260 KiB)

media/matmul.svg (new file, 51 KiB)

scripts/xxd.cmake (new file)

@@ -0,0 +1,16 @@
# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
# Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
SET(INPUT "" CACHE STRING "Input File")
SET(OUTPUT "" CACHE STRING "Output File")
get_filename_component(filename "${INPUT}" NAME)
string(REGEX REPLACE "\\.|-" "_" name "${filename}")
file(READ "${INPUT}" hex_data HEX)
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}")
string(LENGTH ${hex_data} hex_len)
math(EXPR len "${hex_len} / 2")
file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n")
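For reference, the generated header has the shape sketched below (a toy six-byte input is assumed; the symbol name is the input filename with '.' and '-' replaced by '_'):

// shape of the output written by scripts/xxd.cmake (toy example)
unsigned char index_html[] = {0x3c,0x68,0x74,0x6d,0x6c,0x3e,};
unsigned int index_html_len = 6;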

sgemm.cpp

File diff suppressed because it is too large

@@ -1,11 +1,13 @@
#pragma once
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
bool llamafile_sgemm(int, int, int, const void *, int, const void *, int,
void *, int, int, int, int, int, int, int);
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int,
int, int, int, int);
#ifdef __cplusplus
}


@@ -46,7 +46,11 @@ int main(void) {
// No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
// CohereForAI/c4ai-command-r-plus
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
// Llama-3
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
// Phi-3
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@@ -73,6 +77,10 @@ int main(void) {
"SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// CohereForAI/c4ai-command-r-plus
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
// Llama 3
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
// Phi 3
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
};
std::vector<char> formatted_chat(1024);
int32_t res;