diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile new file mode 100644 index 000000000..34ba856d3 --- /dev/null +++ b/.devops/full-musa.Dockerfile @@ -0,0 +1,26 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) && \ + cp build/bin/* . + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 680d1cb92..df496bcd2 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . # Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile new file mode 100644 index 000000000..b5696794f --- /dev/null +++ b/.devops/llama-cli-musa.Dockerfile @@ -0,0 +1,30 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the MUSA runtime image +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release --target llama-cli -j$(nproc) + +FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libgomp1 + +COPY --from=build /app/build/ggml/src/libggml.so /libggml.so +COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/build/bin/llama-cli /llama-cli + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index c3d1ab067..e60c747bd 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. 
-ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . # Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile new file mode 100644 index 000000000..193a6d77c --- /dev/null +++ b/.devops/llama-server-musa.Dockerfile @@ -0,0 +1,35 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the MUSA runtime image +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release --target llama-server -j$(nproc) + +FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 curl + +COPY --from=build /app/build/ggml/src/libggml.so /libggml.so +COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/build/bin/llama-server /llama-server + +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index fd0e19ad6..8553af75b 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . 
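As a quick orientation for the new MUSA images introduced above, here is a minimal sketch of how they might be built and run locally. The image tags, model path and port mapping are illustrative assumptions, and any device pass-through flags required by the MUSA container runtime on the host are omitted.

```bash
# Build the light (CLI) and server MUSA images from the new Dockerfiles
docker build -f .devops/llama-cli-musa.Dockerfile    -t llama.cpp:light-musa  .
docker build -f .devops/llama-server-musa.Dockerfile -t llama.cpp:server-musa .

# Run the server image: LLAMA_ARG_HOST=0.0.0.0 is baked in, and the
# HEALTHCHECK curls http://localhost:8080/health inside the container.
# The mounted model path and the port below are assumptions for illustration.
docker run --rm -p 8080:8080 -v "$PWD/models":/models \
    llama.cpp:server-musa -m /models/model.gguf --port 8080
```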
# Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index bfdbb4ef5..1c8787ef7 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -27,10 +27,10 @@ on: push: branches: - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a54c5de99..423173b97 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} cancel-in-progress: true +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + contents: write # for creating release + env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 @@ -956,6 +961,7 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* @@ -1031,7 +1037,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . 
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 69c9f4f69..f63860d14 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -3,6 +3,11 @@ on: schedule: - cron: "42 0 * * *" +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + issues: write + jobs: close-issues: runs-on: ubuntu-latest diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a4ac9b217..a953cdac9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,6 +43,9 @@ jobs: - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 4aa4b2379..0da6acdf1 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -21,6 +21,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} cancel-in-progress: true +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub + id-token: write + contents: read + jobs: nix-build-aarch64: runs-on: ubuntu-latest diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 8955f38d0..8ecbbe53b 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -12,6 +12,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} cancel-in-progress: true +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub + id-token: write + contents: read + jobs: nix-eval: strategy: diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index e5ff5e6d7..373bb6010 100644 --- 
a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,11 +4,13 @@ on: push: paths: - '.github/workflows/python-type-check.yml' + - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' pull_request: paths: - '.github/workflows/python-type-check.yml' + - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' @@ -33,6 +35,6 @@ jobs: - name: Type-check with Pyright uses: jakebailey/pyright-action@v2 with: - version: 1.1.370 + version: 1.1.382 level: warning warnings: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 973907819..ef0932a7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) +# utils +option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) + # extra artifacts option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -85,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE) set(GGML_LLAMAFILE_DEFAULT ON) endif() +if (NOT DEFINED GGML_AMX) + set(GGML_AMX ON) +endif() + if (NOT DEFINED GGML_CUDA_GRAPHS) set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() @@ -191,17 +198,19 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" DESTINATION lib/pkgconfig) # -# programs, examples and tests +# utils, programs, examples and tests # -add_subdirectory(common) +if (LLAMA_BUILD_COMMON) + add_subdirectory(common) +endif() -if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) include(CTest) add_subdirectory(tests) -endif () +endif() -if (LLAMA_BUILD_EXAMPLES) +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) add_subdirectory(examples) add_subdirectory(pocs) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a9e000e52..4c882c254 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,24 +1,23 @@ # Pull requests (for contributors) - Test your changes: - - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library + - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library - Execute [the full CI locally on your machine](ci/README.md) before publishing -- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience -- Consider allowing write access to your branch for faster review +- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs +- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - If your PR becomes stale, don't hesitate to ping the maintainers in the comments # Pull requests (for collaborators) - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. 
For example: `utils : fix typo in utils.py (#1234)` -- Optionally, pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules # Coding guidelines - Avoid adding third-party dependencies, extra files, extra headers, etc. - Always consider cross-compatibility with other operating systems and architectures -- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple +- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) @@ -27,3 +26,8 @@ ![matmul](media/matmul.png) +# Resources + +The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: + +https://github.com/ggerganov/llama.cpp/projects diff --git a/Makefile b/Makefile index 6bbdcb2e3..764a1cbd3 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,6 @@ BUILD_TARGETS = \ llama-batched \ llama-batched-bench \ llama-bench \ - llama-benchmark-matmult \ llama-cli \ llama-convert-llama2c-to-ggml \ llama-embedding \ @@ -71,7 +70,7 @@ TEST_TARGETS = \ # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. 
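Tying back to the new `LLAMA_BUILD_COMMON` option added in CMakeLists.txt above: because tests and examples are now gated on it as well, a library-only build can be configured roughly as sketched below (the build directory name is an arbitrary choice for illustration).

```bash
# Library-only configuration: common/, tests/ and examples/ are all skipped,
# since each of them now requires LLAMA_BUILD_COMMON to be ON.
cmake -B build-lib -DCMAKE_BUILD_TYPE=Release \
    -DLLAMA_BUILD_COMMON=OFF \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_BUILD_EXAMPLES=OFF
cmake --build build-lib --config Release -j"$(nproc)"
```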
@@ -97,11 +96,6 @@ GGML_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_OPENMP -GGML_OPENMP := 1 -DEPRECATE_WARNING := 1 -endif - ifdef LLAMA_RPC GGML_RPC := 1 DEPRECATE_WARNING := 1 @@ -588,6 +582,11 @@ ifndef GGML_NO_LLAMAFILE OBJ_GGML += ggml/src/llamafile/sgemm.o endif +ifndef GGML_NO_AMX + MK_CPPFLAGS += -DGGML_USE_AMX + OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o +endif + ifdef GGML_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJ_GGML += ggml/src/ggml-rpc.o @@ -1059,10 +1058,11 @@ ggml/src/ggml-alloc.o: \ $(CC) $(CFLAGS) -c $< -o $@ ggml/src/ggml-backend.o: \ - ggml/src/ggml-backend.c \ + ggml/src/ggml-backend.cpp \ + ggml/src/ggml-backend-impl.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) -c $< -o $@ ggml/src/ggml-quants.o: \ ggml/src/ggml-quants.c \ @@ -1091,6 +1091,19 @@ ggml/src/llamafile/sgemm.o: \ $(CXX) $(CXXFLAGS) -c $< -o $@ endif # GGML_NO_LLAMAFILE +ifndef GGML_NO_AMX +ggml/src/ggml-amx.o: \ + ggml/src/ggml-amx.cpp \ + ggml/include/ggml-amx.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml/src/ggml-amx/mmq.o: \ + ggml/src/ggml-amx/mmq.cpp \ + ggml/src/ggml-amx/mmq.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif + ifdef GGML_RPC ggml/src/ggml-rpc.o: \ ggml/src/ggml-rpc.cpp \ @@ -1251,6 +1264,7 @@ clean: rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o rm -vrf ggml/src/ggml-cuda/template-instances/*.o + rm -vrf ggml/src/ggml-amx/*.o rm -rvf $(BUILD_TARGETS) rm -rvf $(TEST_TARGETS) rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp @@ -1539,16 +1553,6 @@ common/build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \ - $(OBJ_GGML) common/build-info.o - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -run-benchmark-matmult: llama-benchmark-matmult - ./$@ - -.PHONY: run-benchmark-matmult swift - tests/test-arg-parser: tests/test-arg-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/Package.swift b/Package.swift index 1d90b47bf..3a17e6c34 100644 --- a/Package.swift +++ b/Package.swift @@ -11,7 +11,7 @@ var sources = [ "src/unicode-data.cpp", "ggml/src/ggml.c", "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.c", + "ggml/src/ggml-backend.cpp", "ggml/src/ggml-quants.c", "ggml/src/ggml-aarch64.c", ] diff --git a/README.md b/README.md index ce954f713..8fe1f4b4b 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** +- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- @@ -28,9 +29,9 @@ variety of hardware - locally and in the cloud. 
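A practical note on the Makefile changes above: the AMX objects are now compiled by default and guarded by `GGML_NO_AMX`. A sketch of opting out in a make-based build follows (the target shown is just one of the existing binaries).

```bash
# Build without the new AMX code path: GGML_NO_AMX skips ggml-amx.o,
# ggml-amx/mmq.o and the -DGGML_USE_AMX define.
make GGML_NO_AMX=1 llama-cli -j"$(nproc)"
```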
- Plain C/C++ implementation without any dependencies - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks -- AVX, AVX2 and AVX512 support for x86 architectures +- AVX, AVX2, AVX512 and AMX support for x86 architectures - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use -- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA) - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity @@ -91,6 +92,8 @@ Typically finetunes of the base models below are supported as well. - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) +- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) +- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) @@ -120,6 +123,7 @@ Typically finetunes of the base models below are supported as well. - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) +- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) @@ -128,6 +132,8 @@ Typically finetunes of the base models below are supported as well. 
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) +- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) +- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) **UI:** @@ -167,12 +173,15 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) +- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML +- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) @@ -181,6 +190,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs +- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly **Games:** - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. @@ -409,7 +419,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [BLAS](./docs/build.md#blas-build) | All | | [BLIS](./docs/backend/BLIS.md) | All | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](./docs/build.md#musa) | Moore Threads GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [Vulkan](./docs/build.md#vulkan) | GPU | @@ -441,7 +451,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - Contributors can open PRs - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions -- Any help with managing issues and PRs is very appreciated! +- Any help with managing issues, PRs and projects is very appreciated! 
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) diff --git a/ci/run.sh b/ci/run.sh index 1ac08ee4e..e06778219 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#/bin/bash +#!/bin/bash # # sample usage: # @@ -712,6 +712,82 @@ function gg_run_embd_bge_small { set +e } +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + +# rerank_tiny + +function gg_run_rerank_tiny { + cd ${SRC} + + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json + gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json + + gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json + + path_models="../models-mnt/rerank-tiny" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + + model_f16="${path_models}/ggml-model-f16.gguf" + + # for this model, the SEP token is "" + (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." 
--pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log + + # sample output + # rerank score 0: 0.029 + # rerank score 1: 0.029 + # rerank score 2: 0.135 + + # check that the score is in the range [$3, $4] + function check_score { + qnt="$1" + score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4" + return 20 + fi + + printf ' - %s @ %s OK\n' "$qnt" "$score" + return 0 + } + + check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log + check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log + check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log + + set +e +} + +function gg_sum_rerank_tiny { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Rerank Tiny (Jina):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)" +} + function gg_check_build_requirements { if ! command -v cmake &> /dev/null; then gg_printf 'cmake not found, please install' @@ -726,15 +802,6 @@ function gg_check_build_requirements { fi } -function gg_sum_embd_bge_small { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'BGE Small (BERT):\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" - gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" -} - ## main export LLAMA_LOG_PREFIX=1 @@ -762,6 +829,7 @@ test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run embd_bge_small + test $ret -eq 0 && gg_run rerank_tiny if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then test $ret -eq 0 && gg_run test_scripts_debug diff --git a/common/arg.cpp b/common/arg.cpp index 4fe57216c..77f40b4a4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,27 +17,27 @@ using json = nlohmann::ordered_json; -llama_arg & llama_arg::set_examples(std::initializer_list examples) { +common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; } -llama_arg & llama_arg::set_env(const char * env) { +common_arg & common_arg::set_env(const char * env) { help = help + "\n(env: " + env + ")"; this->env = env; return *this; } -llama_arg & llama_arg::set_sparam() { +common_arg & common_arg::set_sparam() { is_sparam = true; return *this; } -bool llama_arg::in_example(enum llama_example ex) { +bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } -bool llama_arg::get_value_from_env(std::string & output) { +bool common_arg::get_value_from_env(std::string & output) { if (env == nullptr) return false; char * value = std::getenv(env); if (value) { @@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) { return false; } -bool llama_arg::has_value_from_env() { +bool common_arg::has_value_from_env() { return env != nullptr && std::getenv(env); } @@ -78,7 +78,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string llama_arg::to_string() { +std::string common_arg::to_string() { // params for printing to console const static int n_leading_spaces = 40; const static int 
n_char_per_line_help = 70; // TODO: detect this based on current console @@ -119,33 +119,7 @@ std::string llama_arg::to_string() { // utils // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -static void gpt_params_handle_model_default(gpt_params & params) { +static void common_params_handle_model_default(common_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { @@ -171,12 +145,12 @@ static void gpt_params_handle_model_default(gpt_params & params) { // CLI argument parsing functions // -static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { +static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { std::string arg; const std::string arg_prefix = "--"; - gpt_params & params = ctx_arg.params; + common_params & params = ctx_arg.params; - std::unordered_map arg_to_options; + std::unordered_map arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { arg_to_options[arg] = &opt; @@ -199,7 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } @@ -220,7 +194,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx std::replace(arg.begin(), arg.end(), '_', '-'); } if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { @@ -252,7 +226,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); @@ -268,7 +242,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - gpt_params_handle_model_default(params); + common_params_handle_model_default(params); if (params.escape) { string_process_escapes(params.prompt); @@ -284,19 +258,23 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx params.kv_overrides.back().key[0] = 0; } + if (params.reranking && params.embedding) { + throw std::invalid_argument("error: either --embedding or --reranking can be 
specified, but not both"); + } + return true; } -static void gpt_params_print_usage(gpt_params_context & ctx_arg) { - auto print_options = [](std::vector & options) { - for (llama_arg * opt : options) { +static void common_params_print_usage(common_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (common_arg * opt : options) { printf("%s", opt->to_string().c_str()); } }; - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; for (auto & opt : ctx_arg.options) { // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example if (opt.is_sparam) { @@ -316,17 +294,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) { print_options(specific_options); } -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); - const gpt_params params_org = ctx_arg.params; // the example can modify the default params +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = common_params_parser_init(params, ex, print_usage); + const common_params params_org = ctx_arg.params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + if (!common_params_parse_ex(argc, argv, ctx_arg)) { ctx_arg.params = params_org; return false; } if (ctx_arg.params.usage) { - gpt_params_print_usage(ctx_arg); + common_params_print_usage(ctx_arg); if (ctx_arg.print_usage) { ctx_arg.print_usage(argc, argv); } @@ -341,16 +319,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example return true; } -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - gpt_params_context ctx_arg(params); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; std::string sampler_type_chars; std::string sampler_type_names; for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + sampler_type_chars += common_sampler_type_to_chr(sampler); + sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } sampler_type_names.pop_back(); @@ -362,374 +340,374 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ - auto add_opt = [&](llama_arg arg) { + auto add_opt = [&](common_arg arg) { if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { ctx_arg.options.push_back(std::move(arg)); } }; - add_opt(llama_arg( + add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](gpt_params & params) { + [](common_params & params) { params.usage = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--version"}, "show version and build info", - [](gpt_params &) { + [](common_params &) { fprintf(stderr, "version: %d (%s)\n", 
LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { + string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](common_params & params) { params.verbose_prompt = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + )); + add_opt(common_arg( {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { + string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](common_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params) { + string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](common_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { + string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](common_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_env("LLAMA_ARG_THREADS")); - add_opt(llama_arg( + add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { + string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](common_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.cpuparams.priority = (enum ggml_sched_priority) prio; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { + string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](common_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.strict_cpu = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.poll = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { + string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](common_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { + string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { + string_format("size of the prompt context (default: %d, 0 = loaded from 
model)", params.n_ctx), + [](common_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(llama_arg( + add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { + string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](common_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(llama_arg( + add_opt(common_arg( {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { + string_format("logical maximum batch size (default: %d)", params.n_batch), + [](common_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); - add_opt(llama_arg( + add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { + string_format("physical maximum batch size (default: %d)", params.n_ubatch), + [](common_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); - add_opt(llama_arg( + add_opt(common_arg( {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { + string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [](common_params & params, int value) { params.n_keep = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-context-shift"}, - format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](gpt_params & params) { + string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + [](common_params & params) { params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); - add_opt(llama_arg( + add_opt(common_arg( {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { + string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](common_params & params, int value) { params.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](common_params & params) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(llama_arg( + add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.prompt = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-perf"}, - format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), - [](gpt_params & params) { + string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); - add_opt(llama_arg( + add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -739,24 +717,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -766,287 +744,301 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](gpt_params & params) { + string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), + [](common_params & params) { params.escape = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](gpt_params & params) { + [](common_params & params) { params.escape = false; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { + string_format("print token count every N tokens (default: %d)", params.n_print), + [](common_params & params, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { + string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](common_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"-cnv", "--conversation"}, - format( + string_format( "run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" "(default: %s)", params.conversation ? "true" : "false" ), - [](gpt_params & params) { + [](common_params & params) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { + string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](common_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [](gpt_params & params) { + string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), + [](common_params & params) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { + [](common_params & params) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { + [](common_params & params) { params.input_prefix_bos = true; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](gpt_params & params) { + [](common_params & params) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--spm-infill"}, - format( + string_format( "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](gpt_params & params) { + [](common_params & params) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED), - [](gpt_params & params, const std::string & value) { + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); + string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](common_params & params, const std::string & value) { + params.sparams.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { + [](common_params & params) { params.sparams.ignore_eos = true; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params) { + string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), + [](common_params & params) { params.sparams.penalize_nl = true; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { + string_format("temperature (default: %.1f)", (double)params.sparams.temp), + [](common_params & params, const std::string & value) { params.sparams.temp = std::stof(value); params.sparams.temp = std::max(params.sparams.temp, 0.0f); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { + string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](common_params & params, int value) { params.sparams.top_k = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](common_params & params, const std::string & value) { params.sparams.top_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](common_params & params, const std::string & value) { params.sparams.min_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { + string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](common_params & params, const std::string & value) { params.sparams.tfs_z = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( + {"--xtc-probability"}, "N", + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + [](common_params & params, const std::string & value) { + params.sparams.xtc_probability = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--xtc-threshold"}, "N", + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + [](common_params & params, const std::string & value) { + params.sparams.xtc_threshold = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](common_params & params, const std::string & value) { params.sparams.typ_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { + string_format("last n tokens to consider for penalize (default: %d, 0 = 
disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](common_params & params, int value) { params.sparams.penalty_last_n = value; params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](common_params & params, const std::string & value) { params.sparams.penalty_repeat = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](common_params & params, const std::string & value) { params.sparams.penalty_present = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](common_params & params, const std::string & value) { params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](common_params & params, const std::string & value) { params.sparams.dynatemp_range = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](common_params & params, const std::string & value) { params.sparams.dynatemp_exponent = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.sparams.mirostat = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](common_params & params, const 
std::string & value) { params.sparams.mirostat_eta = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](common_params & params, const std::string & value) { params.sparams.mirostat_tau = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1063,20 +1055,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](common_params & params, const std::string & value) { params.sparams.grammar = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::copy( std::istreambuf_iterator(file), @@ -1085,293 +1077,294 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); - add_opt(llama_arg( - {"--pooling"}, "{none,mean,cls,last}", + add_opt(common_arg( + {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); - add_opt(llama_arg( - {"--attention"}, "{causal,non,causal}", + add_opt(common_arg( + {"--attention"}, "{causal,non-causal}", "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } ).set_env("LLAMA_ARG_ROPE_SCALE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); - add_opt(llama_arg( + 
add_opt(common_arg( {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { + string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](common_params & params, int value) { params.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); - add_opt(llama_arg( + add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { + string_format("group-attention factor (default: %d)", params.grp_attn_n), + [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N")); - add_opt(llama_arg( + ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { + string_format("group-attention width (default: %d)", params.grp_attn_w), + [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W")); - add_opt(llama_arg( + ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](gpt_params & params) { + [](common_params & params) { params.dump_kv_cache = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, 
"disable KV offload", - [](gpt_params & params) { + [](common_params & params) { params.no_kv_offload = true; } ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); - add_opt(llama_arg( + add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); - add_opt(llama_arg( + add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); - add_opt(llama_arg( + add_opt(common_arg( {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { + string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](common_params & params) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](common_params & params, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](common_params & params, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the 
multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](common_params & params, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params) { + [](common_params & params) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logits_file = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { + string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](common_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { + string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](common_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { + string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( + add_opt(common_arg( {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](common_params & params, int value) { params.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); - add_opt(llama_arg( + add_opt(common_arg( {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { + string_format("number of sequences to decode (default: %d)", params.n_sequences), + [](common_params & params, int value) { params.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); - add_opt(llama_arg( + add_opt(common_arg( {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? 
"enabled" : "disabled"), + [](common_params & params) { params.cont_batching = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(llama_arg( + add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](gpt_params & params) { + [](common_params & params) { params.cont_batching = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(llama_arg( + add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(llama_arg( + add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); -#ifdef GGML_USE_RPC - add_opt(llama_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { - params.rpc_servers = value; - } - ).set_env("LLAMA_ARG_RPC")); -#endif - add_opt(llama_arg( + if (llama_supports_rpc()) { + add_opt(common_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](common_params & params, const std::string & value) { + params.rpc_servers = value; + } + ).set_env("LLAMA_ARG_RPC")); + } + add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { + [](common_params & params) { params.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); - add_opt(llama_arg( + add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { + [](common_params & params) { params.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); - add_opt(llama_arg( + add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" "- distribute: spread execution evenly over all nodes\n" @@ -1379,17 +1372,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); - add_opt(llama_arg( + add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1397,10 +1390,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } 
).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(llama_arg( + add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1408,13 +1401,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1434,10 +1427,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_env("LLAMA_ARG_SPLIT_MODE")); - add_opt(llama_arg( + add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1446,7 +1439,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, std::vector split_arg{ it, {} }; if (split_arg.size() >= llama_max_devices()) { throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) ); } for (size_t i = 0; i < llama_max_devices(); ++i) { @@ -1461,308 +1454,315 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_env("LLAMA_ARG_TENSOR_SPLIT")); - add_opt(llama_arg( + add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { + string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](common_params & params, int value) { params.main_gpu = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } } ).set_env("LLAMA_ARG_MAIN_GPU")); - add_opt(llama_arg( + add_opt(common_arg( {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { + string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](common_params & params) { params.check_tensors = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { + [](common_params & params, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); - add_opt(llama_arg( + add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA ? 
std::string("model path from which to load base model") - : format( + : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(llama_arg( + add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(llama_arg( + add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(llama_arg( + add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); - add_opt(llama_arg( + add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { + string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](common_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](common_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), - 
[](gpt_params & params, int value) { + string_format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](common_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( + add_opt(common_arg( {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { + string_format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](common_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( + add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", + string_format("output file (default: '%s')", ex == LLAMA_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { + string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](common_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { + string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](common_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { + string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](common_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { + string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { + string_format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](common_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { + string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), + [](common_params & params) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { + string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](common_params & params, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-separator"}, "STRING", - "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { + "separator of embeddings (default \\n) for example \"<#sep#>\"", + [](common_params & params, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("ip address to listen (default: %s)", params.hostname.c_str()), + [](common_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(llama_arg( + add_opt(common_arg( {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { + string_format("port to listen (default: %d)", params.port), + [](common_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(llama_arg( + add_opt(common_arg( {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](common_params 
& params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); - add_opt(llama_arg( + add_opt(common_arg( {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](common_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(llama_arg( + add_opt(common_arg( + {"--reranking", "--rerank"}, + string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), + [](common_params & params) { + params.reranking = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); + add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(llama_arg( + add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string key; while (std::getline(key_file, key)) { @@ -1773,70 +1773,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, key_file.close(); } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { + string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](common_params & params, int value) { params.timeout_read = value; params.timeout_write = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); - add_opt(llama_arg( + add_opt(common_arg( {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { + string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](common_params & params, int value) { params.n_threads_http = value; } 
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(llama_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; + add_opt(common_arg( + {"--cache-reuse"}, "N", + string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), + [](common_params & params, int value) { + params.n_cache_reuse = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); + add_opt(common_arg( {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](common_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( + add_opt(common_arg( + {"--slots"}, + string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(common_arg( + {"--props"}, + string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); + add_opt(common_arg( {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled"), - [](gpt_params & params) { + "disables slots monitoring endpoint", + [](common_params & params) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(llama_arg( + add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1844,22 +1848,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", - [](gpt_params & params) { + [](common_params & params) { params.use_jinja = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted (unless --jinja is set before this flag):\n" "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { - if (!llama_chat_verify_template(value, params.use_jinja)) { - throw std::runtime_error(format( + [](common_params & params, const std::string & value) { + if (!common_chat_verify_template(value, params.use_jinja)) { + throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s%s\n", value.c_str(), params.use_jinja ? "" : "\nnote: llama.cpp does not use jinja parser, we only support commonly used templates" @@ -1868,16 +1872,16 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.chat_template = value; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(llama_arg( + add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", "set custom jinja chat template file (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted (unless --jinja is set before this flag):\n" "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string chat_template; std::copy( @@ -1885,8 +1889,8 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, std::istreambuf_iterator(), std::back_inserter(chat_template) ); - if (!llama_chat_verify_template(chat_template, params.use_jinja)) { - throw std::runtime_error(format( + if (!common_chat_verify_template(chat_template, params.use_jinja)) { + throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s%s\n", value.c_str(), params.use_jinja ? 
"" : "\nnote: llama.cpp does not use jinja parser, we only support commonly used templates" @@ -1895,31 +1899,31 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.chat_template = chat_template; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { + string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + [](common_params & params) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { + [](common_params & params) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1927,101 +1931,101 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { + string_format("batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](common_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { + string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](common_params & params, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { - gpt_log_pause(gpt_log_main()); + [](common_params &) { + common_log_pause(common_log_main()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--log-file"}, "FNAME", "Log to file", - [](gpt_params &, const std::string & value) { - gpt_log_set_file(gpt_log_main(), value.c_str()); + [](common_params &, const std::string & value) { + common_log_set_file(common_log_main(), value.c_str()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--log-colors"}, "Enable colored logging", - [](gpt_params &) { - gpt_log_set_colors(gpt_log_main(), true); + [](common_params &) { + common_log_set_colors(common_log_main(), true); } ).set_env("LLAMA_LOG_COLORS")); - add_opt(llama_arg( + add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](gpt_params & params) { + [](common_params & params) { params.verbosity = INT_MAX; - gpt_log_set_verbosity_thold(INT_MAX); + common_log_set_verbosity_thold(INT_MAX); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. 
Messages with a higher verbosity will be ignored.", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.verbosity = value; - gpt_log_set_verbosity_thold(value); + common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); - add_opt(llama_arg( + add_opt(common_arg( {"--log-prefix"}, "Enable prefx in log messages", - [](gpt_params &) { - gpt_log_set_prefix(gpt_log_main(), true); + [](common_params &) { + common_log_set_prefix(common_log_main(), true); } ).set_env("LLAMA_LOG_PREFIX")); - add_opt(llama_arg( + add_opt(common_arg( {"--log-timestamps"}, "Enable timestamps in log messages", - [](gpt_params &) { - gpt_log_set_timestamps(gpt_log_main(), true); + [](common_params &) { + common_log_set_timestamps(common_log_main(), true); } ).set_env("LLAMA_LOG_TIMESTAMPS")); diff --git a/common/arg.h b/common/arg.h index 413de2c88..a6700d323 100644 --- a/common/arg.h +++ b/common/arg.h @@ -10,7 +10,7 @@ // CLI argument parsing // -struct llama_arg { +struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; const char * value_hint = nullptr; // help text or example for arg value @@ -18,60 +18,60 @@ struct llama_arg { const char * env = nullptr; std::string help; bool is_sparam = false; // is current arg a sampling param? - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; + void (*handler_void) (common_params & params) = nullptr; + void (*handler_string) (common_params & params, const std::string &) = nullptr; + void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (common_params & params, int) = nullptr; - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, const std::string &) + void (*handler)(common_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, int) + void (*handler)(common_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - llama_arg( + common_arg( const std::initializer_list & args, const std::string & help, - void (*handler)(gpt_params & params) + void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const char * value_hint_2, const std::string & help, - void (*handler)(gpt_params & params, const std::string &, const std::string &) + void (*handler)(common_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_examples(std::initializer_list examples); - llama_arg & set_env(const char * env); - llama_arg & set_sparam(); + common_arg & set_examples(std::initializer_list examples); + common_arg & set_env(const char * env); + common_arg & set_sparam(); bool in_example(enum llama_example ex); bool get_value_from_env(std::string & output); bool has_value_from_env(); 
std::string to_string(); }; -struct gpt_params_context { +struct common_params_context { enum llama_example ex = LLAMA_EXAMPLE_COMMON; - gpt_params & params; - std::vector options; + common_params & params; + std::vector options; void(*print_usage)(int, char **) = nullptr; - gpt_params_context(gpt_params & params) : params(params) {} + common_params_context(common_params & params) : params(params) {} }; // parse input arguments from CLI // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // function to be used by test-arg-parser -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 909aa1970..781d35f86 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -24,10 +25,10 @@ #include #include #include +#include #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -364,10 +365,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -void gpt_init() { +void common_init() { llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { - gpt_log_add(gpt_log_main(), level, "%s", text); + if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { + common_log_add(common_log_main(), level, "%s", text); } }, NULL); @@ -380,7 +381,7 @@ void gpt_init() { LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); } -std::string gpt_params_get_system_info(const gpt_params & params) { +std::string common_params_get_system_info(const common_params & params) { std::ostringstream os; os << "system_info: n_threads = " << params.cpuparams.n_threads; @@ -402,6 +403,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) { // String utils // +std::string string_format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + std::vector string_split(std::string input, char separator) { std::vector parts; size_t separator_pos = input.find(separator); @@ -495,7 +511,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector & lora_adapters) { +void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { llama_lora_adapter_clear(ctx); for (auto & la : lora_adapters) { if (la.scale != 0.0f) { @@ -981,7 +1025,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector curl(curl_easy_init(), &curl_easy_cleanup); @@ -1188,15 +1237,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat } // Send a HEAD request to retrieve the etag and last-modified headers - struct llama_load_model_from_url_headers { + struct common_load_model_from_url_headers { std::string etag; std::string last_modified; }; - llama_load_model_from_url_headers headers; + common_load_model_from_url_headers headers; { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1332,7 +1381,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat return true; } -struct llama_model * llama_load_model_from_url( +struct llama_model * common_load_model_from_url( const char * model_url, const char * path_model, const char * hf_token, @@ -1343,7 +1392,7 @@ struct llama_model * llama_load_model_from_url( return NULL; } - if (!llama_download_file(model_url, path_model, hf_token)) { + if (!common_download_file(model_url, path_model, hf_token)) { return NULL; } @@ -1396,7 +1445,7 @@ struct llama_model * llama_load_model_from_url( char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - return llama_download_file(split_url, split_path, hf_token); + return common_download_file(split_url, split_path, hf_token); }, idx)); } @@ -1411,7 +1460,7 @@ struct llama_model * llama_load_model_from_url( return llama_load_model_from_file(path_model, params); } -struct llama_model * llama_load_model_from_hf( +struct llama_model * common_load_model_from_hf( const char * repo, const char * model, const char * path_model, @@ -1431,12 +1480,12 @@ struct llama_model * llama_load_model_from_hf( model_url += "/resolve/main/"; model_url += model; - return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params); + return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params); } #else -struct llama_model * llama_load_model_from_url( +struct llama_model * common_load_model_from_url( const char * /*model_url*/, const char * /*path_model*/, const char * /*hf_token*/, @@ -1445,7 +1494,7 @@ struct llama_model * llama_load_model_from_url( return nullptr; } -struct 
llama_model * llama_load_model_from_hf( +struct llama_model * common_load_model_from_hf( const char * /*repo*/, const char * /*model*/, const char * /*path_model*/, @@ -1461,16 +1510,18 @@ struct llama_model * llama_load_model_from_hf( // Batch utils // -void llama_batch_clear(struct llama_batch & batch) { +void common_batch_clear(struct llama_batch & batch) { batch.n_tokens = 0; } -void llama_batch_add( +void common_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, const std::vector & seq_ids, bool logits) { + GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); + batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); @@ -1486,15 +1537,15 @@ void llama_batch_add( // Vocab utils // -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special) { - return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special); + return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); } -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_model * model, const std::string & text, bool add_special, @@ -1513,7 +1564,7 @@ std::vector llama_tokenize( return result; } -static std::string _llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) { +static std::string _common_token_to_piece(const struct llama_model * model, llama_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special); @@ -1529,11 +1580,11 @@ static std::string _llama_token_to_piece(const struct llama_model * model, llama return piece; } -std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { - return _llama_token_to_piece(llama_get_model(ctx), token, special); +std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { + return _common_token_to_piece(llama_get_model(ctx), token, special); } -std::string llama_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { +std::string common_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); @@ -1553,7 +1604,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector // Chat template utils // -bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja) { +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { if (use_jinja) { try { auto chat_template = minja::chat_template(tmpl, "", ""); @@ -1580,9 +1631,9 @@ bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja) { return res >= 0; } -std::string llama_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::vector & msgs, + const std::vector & msgs, bool add_ass) { int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml @@ -1624,35 +1675,35 @@ std::string llama_chat_apply_template(const struct llama_model * model, return formatted_chat; } 
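// --- Illustrative sketch, not part of the diff above -----------------------------
// Typical use of the renamed chat helpers: build a common_chat_msg history and let
// common_chat_apply_template() render it (an empty tmpl is assumed to mean "use the
// template from the model metadata", falling back to chatml per the header comment).
// The `model` pointer is assumed to come from e.g. common_init_from_params().
static void sketch_chat_formatting(const llama_model * model) {
    std::vector<common_chat_msg> chat = {
        {"system", "You are a helpful assistant"},
        {"user",   "How do I enable the reranking endpoint?"},
    };
    // render the full history, appending the assistant prefix for generation
    const std::string prompt = common_chat_apply_template(model, /* tmpl = */ "", chat, /* add_ass = */ true);
    LOG_INF("prompt:\n%s\n", prompt.c_str());

    // for an ongoing conversation, format only the suffix needed for one new message
    const common_chat_msg user_msg = {"user", "And the /slots endpoint?"};
    const std::string delta = common_chat_format_single(model, "", chat, user_msg, /* add_ass = */ true);
    LOG_INF("delta:\n%s\n", delta.c_str());
}
// ----------------------------------------------------------------------------------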
-std::string llama_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct llama_model * model, const std::string & tmpl, - const std::vector & past_msg, - const llama_chat_msg & new_msg, + const std::vector & past_msg, + const common_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false); - std::vector chat_new(past_msg); + auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { ss << "\n"; }; // format chat with new_msg chat_new.push_back(new_msg); - auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); + auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass); // get the diff part ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return ss.str(); } -std::string llama_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct llama_model * model, const std::string & tmpl) { - std::vector msgs = { + std::vector msgs = { {"system", "You are a helpful assistant"}, {"user", "Hello"}, {"assistant", "Hi there"}, {"user", "How are you?"}, }; - return llama_chat_apply_template(model, tmpl, msgs, true); + return common_chat_apply_template(model, tmpl, msgs, true); } static std::string _llama_model_meta_val_str(const struct llama_model * model, const char * key) { @@ -1674,8 +1725,8 @@ minja::chat_template llama_chat_template_from_model( std::string chat_template = chat_template_override ? chat_template_override : _llama_model_meta_val_str(model, "tokenizer.chat_template"); - auto bos_token = _llama_token_to_piece(model, llama_token_bos(model), true); - auto eos_token = _llama_token_to_piece(model, llama_token_eos(model), true); + auto bos_token = _common_token_to_piece(model, llama_token_bos(model), true); + auto eos_token = _common_token_to_piece(model, llama_token_eos(model), true); return {std::move(chat_template), bos_token, eos_token}; } @@ -1683,7 +1734,7 @@ minja::chat_template llama_chat_template_from_model( // KV cache utils // -void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", @@ -1706,7 +1757,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", @@ -1758,7 +1809,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; switch (embd_norm) { @@ -1792,7 +1843,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) } } -float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ +float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){ double sum = 0.0; double sum1 = 0.0; double sum2 = 0.0; @@ -1818,8 +1869,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // Control vector utils // -static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { - llama_control_vector_data result = { -1, {} }; +static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) { + common_control_vector_data result = { -1, {} }; ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -1903,11 +1954,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr return result; } -llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { - llama_control_vector_data result = { -1, {} }; +common_control_vector_data common_control_vector_load(const std::vector & load_infos) { + common_control_vector_data result = { -1, {} }; for (const auto & info : load_infos) { - auto cur = llama_control_vector_load_one(info); + auto cur = common_control_vector_load_one(info); if (cur.n_embd == -1) { result.n_embd = -1; @@ -1999,7 +2050,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha } } -void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, +void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const auto & sparams = params.sparams; @@ -2141,6 +2192,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); + fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? 
"true" : "false"); diff --git a/common/common.h b/common/common.h index 1cb518a70..844afa3f1 100644 --- a/common/common.h +++ b/common/common.h @@ -30,12 +30,12 @@ // Forward declaration namespace minja { class chat_template; } -struct llama_lora_adapter_info { +struct common_lora_adapter_info { std::string path; float scale; }; -struct llama_lora_adapter_container : llama_lora_adapter_info { +struct common_lora_adapter_container : common_lora_adapter_info { struct llama_lora_adapter * adapter; }; @@ -45,7 +45,7 @@ extern char const * LLAMA_COMMIT; extern char const * LLAMA_COMPILER; extern char const * LLAMA_BUILD_TARGET; -struct llama_control_vector_load_info; +struct common_control_vector_load_info; // // CPU utils @@ -88,14 +88,16 @@ enum llama_example { LLAMA_EXAMPLE_COUNT, }; -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, +enum common_sampler_type { + COMMON_SAMPLER_TYPE_NONE = 0, + COMMON_SAMPLER_TYPE_TOP_K = 1, + COMMON_SAMPLER_TYPE_TOP_P = 2, + COMMON_SAMPLER_TYPE_MIN_P = 3, + COMMON_SAMPLER_TYPE_TFS_Z = 4, + COMMON_SAMPLER_TYPE_TYPICAL_P = 5, + COMMON_SAMPLER_TYPE_TEMPERATURE = 6, + COMMON_SAMPLER_TYPE_XTC = 7, + COMMON_SAMPLER_TYPE_INFILL = 8, }; // dimensionality reduction methods, used by cvector-generator @@ -105,7 +107,7 @@ enum dimre_method { }; // sampler parameters -struct gpt_sampler_params { +struct common_sampler_params { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -114,6 +116,8 @@ struct gpt_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities @@ -130,13 +134,15 @@ struct gpt_sampler_params { bool ignore_eos = false; bool no_perf = false; // disable performance metrics - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE + + std::vector samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TFS_Z, + COMMON_SAMPLER_TYPE_TYPICAL_P, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_MIN_P, + COMMON_SAMPLER_TYPE_XTC, + COMMON_SAMPLER_TYPE_TEMPERATURE, }; std::string grammar; // optional BNF-like grammar to constrain sampling @@ -148,7 +154,7 @@ struct gpt_sampler_params { std::string print() const; }; -struct gpt_params { +struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -190,7 +196,7 @@ struct gpt_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct gpt_sampler_params sparams; + struct common_sampler_params sparams; std::string model = ""; // model path // NOLINT std::string model_draft = ""; // draft model for speculative decoding // NOLINT @@ 
-215,9 +221,9 @@ struct gpt_params { std::vector kv_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) - std::vector lora_adapters; // lora adapter path with user defined scale + std::vector lora_adapters; // lora adapter path with user defined scale - std::vector control_vectors; // control vector with user defined scale + std::vector control_vectors; // control vector with user defined scale int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector @@ -275,20 +281,22 @@ struct gpt_params { // embedding bool embedding = false; // get only sentence embedding - int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix - std::string embd_sep = "\n"; // separator of embendings + std::string embd_sep = "\n"; // separator of embeddings + bool reranking = false; // enable reranking support on server // server params int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT std::string chat_template = ""; // NOLINT - std::string system_prompt = ""; // NOLINT + // std::string system_prompt = ""; // NOLINT bool use_jinja = false; // NOLINT bool enable_chat_template = true; @@ -297,7 +305,10 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - bool endpoint_slots = true; + // "advanced" endpoints are disabled by default for better security + bool webui = true; + bool endpoint_slots = false; + bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; bool log_json = false; @@ -352,19 +363,32 @@ struct gpt_params { // call once at the start of a program if it uses libcommon // initializes the logging system and prints info about the build -void gpt_init(); +void common_init(); -std::string gpt_params_get_system_info(const gpt_params & params); +std::string common_params_get_system_info(const common_params & params); -bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); -bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // // String utils // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) 
__attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +std::string string_format(const char * fmt, ...); + std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); @@ -409,29 +433,29 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -struct llama_init_result { +struct common_init_result { struct llama_model * model = nullptr; struct llama_context * context = nullptr; - std::vector lora_adapters; + std::vector lora_adapters; }; -struct llama_init_result llama_init_from_gpt_params(gpt_params & params); +struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct llama_model_params common_model_params_to_llama (const common_params & params); +struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); // clear LoRA adapters from context, then apply new list of adapters -void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); // Batch utils -void llama_batch_clear(struct llama_batch & batch); +void common_batch_clear(struct llama_batch & batch); -void llama_batch_add( +void common_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, @@ -444,13 +468,13 @@ void llama_batch_add( // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false); -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_model * model, const std::string & text, bool add_special, @@ -458,7 +482,7 @@ std::vector llama_tokenize( // tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` -std::string llama_token_to_piece( +std::string common_token_to_piece( const struct llama_context * ctx, llama_token token, bool special = true); @@ -466,7 +490,7 @@ std::string llama_token_to_piece( // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens -std::string llama_detokenize( +std::string common_detokenize( llama_context * ctx, const 
std::vector & tokens, bool special = true); @@ -475,32 +499,32 @@ std::string llama_detokenize( // Chat template utils // -// same as llama_chat_message, but uses std::string and std::vector -struct llama_chat_msg { +// same with llama_chat_message, but uses std::string +struct common_chat_msg { std::string role; std::string content; }; // Check if the template is supported or not. Returns true if it's valid -bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja); +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); // CPP wrapper for llama_chat_apply_template // If the built-in template is not supported, we default to chatml // If the custom "tmpl" is not supported, we throw an error -std::string llama_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::vector & chat, + const std::vector & chat, bool add_ass); // Format single message, while taking into account the position of that message in chat history -std::string llama_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct llama_model * model, const std::string & tmpl, - const std::vector & past_msg, - const llama_chat_msg & new_msg, + const std::vector & past_msg, + const common_chat_msg & new_msg, bool add_ass); // Returns an example of formatted chat -std::string llama_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct llama_model * model, const std::string & tmpl); minja::chat_template llama_chat_template_from_model( @@ -512,31 +536,31 @@ minja::chat_template llama_chat_template_from_model( // // Dump the KV cache view with the number of sequences per cell. -void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); +void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). -void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); // // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); -float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); +float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); // // Control vector utils // -struct llama_control_vector_data { +struct common_control_vector_data { int n_embd; // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd std::vector data; }; -struct llama_control_vector_load_info { +struct common_control_vector_load_info { float strength; std::string fname; @@ -544,7 +568,7 @@ struct llama_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. 
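// --- Illustrative sketch, not part of the diff above -----------------------------
// Loading control vectors with the renamed helpers declared just below. The file
// names and strengths here are hypothetical; per the comment below, n_embd == -1
// signals a load failure.
std::vector<common_control_vector_load_info> cv_infos = {
    { /* strength = */  0.8f, /* fname = */ "direction-happy.gguf"  },
    { /* strength = */ -0.4f, /* fname = */ "direction-formal.gguf" },
};
common_control_vector_data cv = common_control_vector_load(cv_infos);
if (cv.n_embd == -1) {
    // at least one file could not be loaded or was inconsistent with the others
}
// ----------------------------------------------------------------------------------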
// On error, returns {-1, empty} -llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); +common_control_vector_data common_control_vector_load(const std::vector & load_infos); // // Antiprompt utils @@ -656,7 +680,7 @@ private: void build(const llama_context * ctx, const std::vector & stop_words, const std::vector & grammar_trigger_words) { build( [&](const std::string & text) { - return llama_tokenize(ctx, text, /* special= */ true); + return common_tokenize(ctx, text, /* special= */ true); }, stop_words, grammar_trigger_words @@ -772,5 +796,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_non_result_info( - FILE * stream, const gpt_params & params, const llama_context * lctx, + FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/console.cpp b/common/console.cpp index f65cbc6ed..078a8d678 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -94,6 +94,9 @@ namespace console { simple_io = true; } } + if (simple_io) { + _setmode(_fileno(stdin), _O_U8TEXT); + } #else // POSIX-specific console initialization if (!simple_io) { diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index e881e4e7a..e759b31e5 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -592,7 +592,7 @@ private: } return join_seq(); }; - return add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + return add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); } /* diff --git a/common/log.cpp b/common/log.cpp index 5a844ed59..04c7c0ed1 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -8,10 +8,10 @@ #include #include -int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; +int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; -void gpt_log_set_verbosity_thold(int verbosity) { - gpt_log_verbosity_thold = verbosity; +void common_log_set_verbosity_thold(int verbosity) { + common_log_verbosity_thold = verbosity; } #define LOG_COL_DEFAULT "\033[0m" @@ -29,16 +29,16 @@ static int64_t t_us() { } // colors -enum gpt_log_col : int { - GPT_LOG_COL_DEFAULT = 0, - GPT_LOG_COL_BOLD, - GPT_LOG_COL_RED, - GPT_LOG_COL_GREEN, - GPT_LOG_COL_YELLOW, - GPT_LOG_COL_BLUE, - GPT_LOG_COL_MAGENTA, - GPT_LOG_COL_CYAN, - GPT_LOG_COL_WHITE, +enum common_log_col : int { + COMMON_LOG_COL_DEFAULT = 0, + COMMON_LOG_COL_BOLD, + COMMON_LOG_COL_RED, + COMMON_LOG_COL_GREEN, + COMMON_LOG_COL_YELLOW, + COMMON_LOG_COL_BLUE, + COMMON_LOG_COL_MAGENTA, + COMMON_LOG_COL_CYAN, + COMMON_LOG_COL_WHITE, }; // disable colors by default @@ -54,7 +54,7 @@ static std::vector g_col = { "", }; -struct gpt_log_entry { +struct common_log_entry { enum ggml_log_level level; bool prefix; @@ -71,7 +71,7 @@ struct gpt_log_entry { if (!fcur) { // stderr displays DBG messages only when their verbosity level is not higher than the threshold // these messages will still be logged to a file - if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) { return; } @@ -86,19 +86,19 @@ struct gpt_log_entry { if (timestamp) { // [M.s.ms.us] fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", - g_col[GPT_LOG_COL_BLUE], + g_col[COMMON_LOG_COL_BLUE], (int) (timestamp / 1000000 / 60), (int) 
(timestamp / 1000000 % 60), (int) (timestamp / 1000 % 1000), (int) (timestamp % 1000), - g_col[GPT_LOG_COL_DEFAULT]); + g_col[COMMON_LOG_COL_DEFAULT]); } switch (level) { - case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; - case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; - case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; - case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break; default: break; } @@ -107,18 +107,18 @@ struct gpt_log_entry { fprintf(fcur, "%s", msg.data()); if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { - fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]); } fflush(fcur); } }; -struct gpt_log { +struct common_log { // default capacity - will be expanded if needed - gpt_log() : gpt_log(256) {} + common_log() : common_log(256) {} - gpt_log(size_t capacity) { + common_log(size_t capacity) { file = nullptr; prefix = false; timestamps = false; @@ -137,7 +137,7 @@ struct gpt_log { resume(); } - ~gpt_log() { + ~common_log() { pause(); if (file) { fclose(file); @@ -158,12 +158,12 @@ private: int64_t t_start; // ring buffer of entries - std::vector entries; + std::vector entries; size_t head; size_t tail; // worker thread copies into this - gpt_log_entry cur; + common_log_entry cur; public: void add(enum ggml_log_level level, const char * fmt, va_list args) { @@ -219,7 +219,7 @@ public: tail = (tail + 1) % entries.size(); if (tail == head) { // expand the buffer - std::vector new_entries(2*entries.size()); + std::vector new_entries(2*entries.size()); size_t new_tail = 0; @@ -320,15 +320,15 @@ public: pause(); if (colors) { - g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; - g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; - g_col[GPT_LOG_COL_RED] = LOG_COL_RED; - g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; - g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; - g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; - g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; - g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; - g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[COMMON_LOG_COL_RED] = LOG_COL_RED; + g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE; } else { for (size_t i = 0; i < g_col.size(); i++) { g_col[i] = ""; @@ -355,47 +355,47 @@ public: // public API // -struct gpt_log * gpt_log_init() { - return new gpt_log; +struct common_log * common_log_init() { + return new common_log; } -struct gpt_log * gpt_log_main() { - static struct gpt_log log; +struct common_log * common_log_main() { + static struct common_log log; return &log; } -void gpt_log_pause(struct gpt_log * log) { +void common_log_pause(struct common_log * log) { 
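// --- Illustrative sketch, not part of the diff above -----------------------------
// Configuring the renamed logger from application code; "run.log" is a hypothetical
// file name. The LOG_* macros route through common_log_main(), and messages above
// the verbosity threshold are skipped without evaluating their arguments.
struct common_log * log = common_log_main();        // process-wide singleton
common_log_set_file      (log, "run.log");          // also mirror output to a file
common_log_set_colors    (log, true);
common_log_set_prefix    (log, true);
common_log_set_timestamps(log, true);
common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);  // let LOG_DBG messages through
LOG_INF("starting, verbosity threshold = %d\n", common_log_verbosity_thold);
// ----------------------------------------------------------------------------------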
log->pause(); } -void gpt_log_resume(struct gpt_log * log) { +void common_log_resume(struct common_log * log) { log->resume(); } -void gpt_log_free(struct gpt_log * log) { +void common_log_free(struct common_log * log) { delete log; } -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) { va_list args; va_start(args, fmt); log->add(level, fmt, args); va_end(args); } -void gpt_log_set_file(struct gpt_log * log, const char * file) { +void common_log_set_file(struct common_log * log, const char * file) { log->set_file(file); } -void gpt_log_set_colors(struct gpt_log * log, bool colors) { +void common_log_set_colors(struct common_log * log, bool colors) { log->set_colors(colors); } -void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { +void common_log_set_prefix(struct common_log * log, bool prefix) { log->set_prefix(prefix); } -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { +void common_log_set_timestamps(struct common_log * log, bool timestamps) { log->set_timestamps(timestamps); } diff --git a/common/log.h b/common/log.h index 84f9b3ed7..66605cc69 100644 --- a/common/log.h +++ b/common/log.h @@ -14,23 +14,23 @@ #define LOG_DEFAULT_LLAMA 0 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower -// set via gpt_log_set_verbosity() -extern int gpt_log_verbosity_thold; +// set via common_log_set_verbosity() +extern int common_log_verbosity_thold; -void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe +void common_log_set_verbosity_thold(int verbosity); // not thread-safe -// the gpt_log uses an internal worker thread to print/write log messages +// the common_log uses an internal worker thread to print/write log messages // when the worker thread is paused, incoming log messages are discarded -struct gpt_log; +struct common_log; -struct gpt_log * gpt_log_init(); -struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit -void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe -void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe -void gpt_log_free (struct gpt_log * log); +struct common_log * common_log_init(); +struct common_log * common_log_main(); // singleton, automatically destroys itself on exit +void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe +void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe +void common_log_free (struct common_log * log); LOG_ATTRIBUTE_FORMAT(3, 4) -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...); // defaults: file = NULL, colors = false, prefix = false, timestamps = false // @@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f // D - debug (stderr, V = LOG_DEFAULT_DEBUG) // -void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe -void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe -void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix +void common_log_set_file (struct common_log * log, 
const char * file); // not thread-safe +void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe +void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log +void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix // helper macros for logging // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold @@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w // // LOG_DBG("this is a debug message: %d\n", expensive_function()); // -// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold // #define LOG_TMPL(level, verbosity, ...) \ do { \ - if ((verbosity) <= gpt_log_verbosity_thold) { \ - gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + if ((verbosity) <= common_log_verbosity_thold) { \ + common_log_add(common_log_main(), (level), __VA_ARGS__); \ } \ } while (0) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 7953c723e..a9dfb6714 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -8,7 +8,7 @@ #include #include -void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, +void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { const int64_t t_start_ms = ggml_time_ms(); const int64_t inp_size = inp.size(); @@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in const int64_t i_start = std::max(inp_size - nnew, ngram_size); for (int64_t i = i_start; i < inp_size; ++i) { const int64_t ngram_start = i - ngram_size; - llama_ngram ngram(&inp[ngram_start], ngram_size); + common_ngram ngram(&inp[ngram_start], ngram_size); const llama_token token = inp[i]; - llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); + common_ngram_cache::iterator part_it = ngram_cache.find(ngram); if (part_it == ngram_cache.end()) { - llama_ngram_cache_part part; + common_ngram_cache_part part; part.emplace(token, 1); ngram_cache.emplace(ngram, part); } else { - llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + common_ngram_cache_part::iterator token_count_it = part_it->second.find(token); if (token_count_it == part_it->second.end()) { part_it->second.emplace(token, 1); } else { @@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; // Helper function that tries to draft a token from only the static ngram cache: -static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { - llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); +static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { + common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { return -1; } - const llama_ngram_cache_part part_static = part_static_it->second; + const common_ngram_cache_part part_static = part_static_it->second; int max_count_static = 0; int sum_count_static = 0; @@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng // Try to draft a 
token from primary cache (context/dynamic), validate with static cache: static llama_token try_draft( - llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, + common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { llama_token drafted_token = -1; for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { - const llama_ngram ngram_primary = ngrams_primary[i]; + const common_ngram ngram_primary = ngrams_primary[i]; - llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); if (part_primary_it == nc_primary.end()) { continue; } - const llama_ngram_cache_part part_primary = part_primary_it->second; + const common_ngram_cache_part part_primary = part_primary_it->second; int max_count_primary = 0; int max_count_static = 0; @@ -117,7 +117,7 @@ static llama_token try_draft( for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; - llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); const int32_t count_primary = token_count_primary.second; const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; @@ -142,9 +142,9 @@ static llama_token try_draft( return drafted_token; } -void llama_ngram_cache_draft( +void common_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static + common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static ) { GGML_ASSERT(draft.size() == 1); const int inp_size = inp.size(); @@ -157,21 +157,21 @@ void llama_ngram_cache_draft( llama_token drafted_token = -1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; - llama_ngram ngram_static; + common_ngram ngram_static; for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); } - llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); - llama_ngram_cache_part part_static; + common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + common_ngram_cache_part part_static; if (part_static_it != nc_static.end()) { part_static = part_static_it->second; } // cd = context + dynamic - std::vector ngrams_cd; + std::vector ngrams_cd; for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; - llama_ngram ngram_cd; + common_ngram ngram_cd; for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); } @@ -196,16 +196,16 @@ void llama_ngram_cache_draft( } } -void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { +void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) { std::ofstream file_out(filename, std::ios::binary); - for (std::pair item : ngram_cache) { - const llama_ngram ngram = item.first; - llama_ngram_cache_part token_counts = item.second; + for (std::pair item : ngram_cache) { + const 
common_ngram ngram = item.first; + common_ngram_cache_part token_counts = item.second; GGML_ASSERT(!token_counts.empty()); const int32_t ntokens = token_counts.size(); GGML_ASSERT(ntokens > 0); - file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); + file_out.write(reinterpret_cast(&ngram), sizeof(common_ngram)); file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); for (std::pair item2 : token_counts) { const llama_token token = item2.first; @@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen } -llama_ngram_cache llama_ngram_cache_load(std::string & filename) { +common_ngram_cache common_ngram_cache_load(std::string & filename) { std::ifstream hashmap_file(filename, std::ios::binary); if (!hashmap_file) { throw std::ifstream::failure("Unable to open file " + filename); } - llama_ngram_cache ngram_cache; + common_ngram_cache ngram_cache; - llama_ngram ngram; + common_ngram ngram; int32_t ntokens; llama_token token; int32_t count; @@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) { char * ntokensc = reinterpret_cast(&ntokens); char * tokenc = reinterpret_cast(&token); char * countc = reinterpret_cast(&count); - while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { + while(hashmap_file.read(ngramc, sizeof(common_ngram))) { GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); GGML_ASSERT(ntokens > 0); - llama_ngram_cache_part token_counts; + common_ngram_cache_part token_counts; for (int i = 0; i < ntokens; ++i) { GGML_ASSERT(!hashmap_file.eof()); @@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) { return ngram_cache; } -void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { - for (std::pair ngram_part : ngram_cache_add) { - const llama_ngram ngram = ngram_part.first; - llama_ngram_cache_part part = ngram_part.second; +void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const common_ngram ngram = ngram_part.first; + common_ngram_cache_part part = ngram_part.second; - llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); if (part_merged_it == ngram_cache_target.end()) { ngram_cache_target.emplace(ngram, part); continue; @@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram const int32_t count = token_count.second; GGML_ASSERT(count > 0); - llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); if (token_count_merged_it == part_merged_it->second.end()) { part_merged_it->second.emplace(token, count); continue; diff --git a/common/ngram-cache.h b/common/ngram-cache.h index ab4c9b376..09c2b0319 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -12,22 +12,22 @@ // Data structures to map n-grams to empirical token probabilities: -struct llama_ngram { +struct common_ngram { llama_token tokens[LLAMA_NGRAM_MAX]; - llama_ngram() { + common_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { tokens[i] = -1; } } - llama_ngram(const llama_token * input, const int ngram_size) { + common_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < 
LLAMA_NGRAM_MAX; ++i) { tokens[i] = i < ngram_size ? input[i] : -1; } } - bool operator==(const llama_ngram & other) const { + bool operator==(const common_ngram & other) const { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; @@ -37,28 +37,28 @@ struct llama_ngram { } }; -struct llama_token_hash_function { +struct common_token_hash_function { size_t operator()(const llama_token token) const { // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ return token * 11400714819323198485llu; } }; -struct llama_ngram_hash_function { - size_t operator()(const llama_ngram & ngram) const { - size_t hash = llama_token_hash_function{}(ngram.tokens[0]); +struct common_ngram_hash_function { + size_t operator()(const common_ngram & ngram) const { + size_t hash = common_token_hash_function{}(ngram.tokens[0]); for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= llama_token_hash_function{}(ngram.tokens[i]); + hash ^= common_token_hash_function{}(ngram.tokens[i]); } return hash; } }; // token -> number of times token has been seen -typedef std::unordered_map llama_ngram_cache_part; +typedef std::unordered_map common_ngram_cache_part; // n-gram -> empirical distribution of following tokens -typedef std::unordered_map llama_ngram_cache; +typedef std::unordered_map common_ngram_cache; // Update an ngram cache with tokens. @@ -70,8 +70,8 @@ typedef std::unordered_map & inp_data, int nnew, bool print_progress); +void common_ngram_cache_update( + common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. @@ -81,21 +81,21 @@ void llama_ngram_cache_update( // nc_context: ngram cache based on current context. // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. -void llama_ngram_cache_draft( +void common_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); + common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); // Save an ngram cache to a file. // ngram_cache: the ngram cache to save. // filename: the path under which to save the ngram cache. -void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); +void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename); -// Load an ngram cache saved with llama_ngram_cache_save. +// Load an ngram cache saved with common_ngram_cache_save. // filename: the path from which to load the ngram cache. // returns: an ngram cache containing the information saved to filename. -llama_ngram_cache llama_ngram_cache_load(std::string & filename); +common_ngram_cache common_ngram_cache_load(std::string & filename); // Merge two ngram caches. // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. // ngram_cache_add: the ngram cache to add to ngram_cache_target. 
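For reference, here is a minimal sketch of how the renamed `common_ngram_cache_*` API declared above fits together; it is not part of the patch itself. It assumes the token vectors are `std::vector<llama_token>` (as used in the implementation), that `common/` is on the include path, and that the helper name, n-gram sizes, draft length, and `lookup.bin` path are purely illustrative.

```cpp
// Sketch only (assumptions noted above): drafting tokens with the renamed common_ngram_cache API.
#include <string>
#include <vector>

#include "ngram-cache.h"   // common/ngram-cache.h; include path assumed

static std::vector<llama_token> sketch_ngram_draft(
        std::vector<llama_token> & ctx_tokens,   // tokens processed so far (should hold at least LLAMA_NGRAM_STATIC tokens)
        llama_token                seed,         // most recently sampled token, treated as following ctx_tokens
        common_ngram_cache       & nc_dynamic,   // cache persisted across runs
        common_ngram_cache       & nc_static) {  // cache built offline from a large corpus
    const int ngram_min = 1;   // illustrative values
    const int ngram_max = 4;
    const int n_draft   = 8;

    // build a context-level cache from everything seen so far (nnew == full size)
    common_ngram_cache nc_context;
    common_ngram_cache_update(nc_context, ngram_min, ngram_max, ctx_tokens, (int) ctx_tokens.size(), false);

    // the draft must start with exactly one token (asserted by common_ngram_cache_draft)
    std::vector<llama_token> draft = { seed };

    // try to extend the draft, validating candidates against the static cache
    common_ngram_cache_draft(ctx_tokens, draft, n_draft, ngram_min, ngram_max, nc_context, nc_dynamic, nc_static);

    // persist the dynamic cache for the next session
    std::string path = "lookup.bin";             // hypothetical path
    common_ngram_cache_save(nc_dynamic, path);

    return draft;                                // draft[0] == seed, the rest are speculative tokens
}
```

After a generation finishes, the context-level cache would typically be folded into the persistent one with `common_ngram_cache_merge(nc_dynamic, nc_context)` before saving, so later runs benefit from the newly observed n-grams.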
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); +void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add); diff --git a/common/sampling.cpp b/common/sampling.cpp index 5593ae4ef..4cd5c25de 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -98,8 +98,8 @@ struct ring_buffer { std::vector data; }; -struct gpt_sampler { - gpt_sampler_params params; +struct common_sampler { + common_sampler_params params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,21 +125,21 @@ struct gpt_sampler { } }; -std::string gpt_sampler_params::print() const { +std::string common_sampler_params::print() const { char result[1024]; snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); } -bool gpt_sampler_trigger_grammar(const struct llama_model * model, gpt_sampler * gsmpl, const std::string & trigger) { +bool common_sampler_trigger_grammar(const struct llama_model * model, common_sampler * gsmpl, const std::string & trigger) { if (!llama_sampler_is_grammar_empty(gsmpl->grmr)) { return false; } @@ -148,12 +148,12 @@ bool gpt_sampler_trigger_grammar(const struct llama_model * model, gpt_sampler * return true; } -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; - auto * result = new gpt_sampler { + auto * result = new common_sampler { /* .params = */ params, /* .grmr = */ llama_sampler_init_grammar(model, params.grammar_trigger_words.empty() ? 
params.grammar.c_str() : "", "root"), /* .chain = */ llama_sampler_chain_init(lparams), @@ -180,60 +180,52 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st params.penalize_nl, params.ignore_eos)); - if (params.temp > 0.0f) { - if (params.mirostat == 0) { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case GPT_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case GPT_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case GPT_SAMPLER_TYPE_TFS_Z: - llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); - break; - case GPT_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case GPT_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - default: - GGML_ASSERT(false && "unknown sampler type"); - } + if (params.mirostat == 0) { + for (const auto & cnstr : params.samplers) { + switch (cnstr) { + case COMMON_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case COMMON_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TFS_Z: + llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); } - llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); - } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); - } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); - } else { - GGML_ASSERT(false && "unknown mirostat version"); } + llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + } else if (params.mirostat == 1) { + llama_sampler_chain_add(result->chain, 
llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + } else if (params.mirostat == 2) { + llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { - if (params.n_probs > 0) { - // some use cases require to sample greedily, but still obtain the probabilities of the top tokens - // ref: https://github.com/ggerganov/llama.cpp/pull/9605 - // - // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but - // it is much faster, since we avoid sorting all tokens and should give a good approximation - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs)); - llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); - } - llama_sampler_chain_add(result->chain, llama_sampler_init_greedy()); + GGML_ASSERT(false && "unknown mirostat version"); } return result; } -void gpt_sampler_free(struct gpt_sampler * gsmpl) { +void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { if (gsmpl->grmr) { llama_sampler_free(gsmpl->grmr); @@ -245,7 +237,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) { } } -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { if (accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); } @@ -255,7 +247,7 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce gsmpl->prev.push_back(token); } -void gpt_sampler_reset(struct gpt_sampler * gsmpl) { +void common_sampler_reset(struct common_sampler * gsmpl) { if (gsmpl->grmr) { llama_sampler_reset(gsmpl->grmr); } @@ -263,8 +255,8 @@ void gpt_sampler_reset(struct gpt_sampler * gsmpl) { llama_sampler_reset(gsmpl->chain); } -struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { - return new gpt_sampler { +struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { + return new common_sampler { /* .params = */ gsmpl->params, /* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .chain = */ llama_sampler_clone(gsmpl->chain), @@ -274,7 +266,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { }; } -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance if (gsmpl) { @@ -285,7 +277,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * } } -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { gsmpl->set_logits(ctx, idx); auto & grmr = gsmpl->grmr; @@ -331,21 +323,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } // helpers -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { 
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { return &gsmpl->cur_p; } -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { +llama_token common_sampler_last(const struct common_sampler * gsmpl) { return gsmpl->prev.rat(0); } -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { +std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { @@ -356,7 +348,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { return result; } -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) { +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { n = std::min(n, (int) gsmpl->prev.size()); if (n <= 0) { @@ -371,63 +363,69 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); - result += llama_token_to_piece(ctx_main, id); + result += common_token_to_piece(ctx_main, id); } return result; } -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { +char common_sampler_type_to_chr(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return 'k'; - case GPT_SAMPLER_TYPE_TFS_Z: return 'f'; - case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y'; - case GPT_SAMPLER_TYPE_TOP_P: return 'p'; - case GPT_SAMPLER_TYPE_MIN_P: return 'm'; - case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; + case COMMON_SAMPLER_TYPE_TFS_Z: return 'f'; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; + case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; + case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_XTC: return 'x'; + case COMMON_SAMPLER_TYPE_INFILL: return 'i'; default : return '?'; } } -std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { +std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return "top_k"; - case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z"; - case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; - case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; - case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; - case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; + case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z"; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; + case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; + case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_XTC: return "xtc"; + case COMMON_SAMPLER_TYPE_INFILL: return "infill"; default : return ""; } } -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - { "top_k", GPT_SAMPLER_TYPE_TOP_K }, - { "top_p", GPT_SAMPLER_TYPE_TOP_P }, - { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min_p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z }, - { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE }, +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, + { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P 
}, + { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { "xtc", COMMON_SAMPLER_TYPE_XTC }, + { "infill", COMMON_SAMPLER_TYPE_INFILL }, }; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - { "top-k", GPT_SAMPLER_TYPE_TOP_K }, - { "top-p", GPT_SAMPLER_TYPE_TOP_P }, - { "nucleus", GPT_SAMPLER_TYPE_TOP_P }, - { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typical", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min-p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z }, - { "tfs", GPT_SAMPLER_TYPE_TFS_Z }, - { "temp", GPT_SAMPLER_TYPE_TEMPERATURE }, + std::unordered_map sampler_alt_name_map { + { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, + { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, + { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "tfs", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, }; - std::vector samplers; + std::vector samplers; samplers.reserve(names.size()); for (const auto & name : names) { @@ -447,17 +445,19 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map = { - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } +std::vector common_sampler_types_from_chars(const std::string & chars) { + std::unordered_map sampler_name_map = { + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, }; - std::vector samplers; + std::vector samplers; samplers.reserve(chars.size()); for (const auto & c : chars) { diff --git a/common/sampling.h b/common/sampling.h index 34c52377d..d5cce3420 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -7,7 +7,7 @@ #include #include -// gpt_sampler extends llama_sampler with additional functionality: +// common_sampler extends llama_sampler with additional functionality: // // - grammar support // - custom sampler logic based on the parameters @@ -23,30 +23,30 @@ // token in order to verify if it fits the 
grammar. And only if the token doesn't fit the grammar, the // grammar constraints are applied to the full vocabulary and the token is resampled. // -// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can +// The common_sampler also maintains a container with the last accepted tokens. In the future, this can // be moved into the core llama library. // -// For convenience, the gpt_sampler also maintains a container with the current candidate tokens. +// For convenience, the common_sampler also maintains a container with the current candidate tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens. // // TODO: measure grammar performance // -struct gpt_sampler; +struct common_sampler; // llama_sampler API overloads -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); -void gpt_sampler_free(struct gpt_sampler * gsmpl); +void common_sampler_free(struct common_sampler * gsmpl); // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar); -void gpt_sampler_reset (struct gpt_sampler * gsmpl); -struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl); +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +void common_sampler_reset (struct common_sampler * gsmpl); +struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl); +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); // extended sampling implementation: // @@ -58,28 +58,28 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // if grammar_first is true, the grammar is applied before the samplers (slower) // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers // access the internal list of current candidate tokens -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl); +llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // get the last accepted token -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl); +llama_token common_sampler_last(const struct common_sampler * gsmpl); // print the sampler chain into a string -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl); +std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n); +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr); -std::string 
gpt_sampler_type_to_str(enum gpt_sampler_type cnstr); +char common_sampler_type_to_chr(enum common_sampler_type cnstr); +std::string common_sampler_type_to_str(enum common_sampler_type cnstr); -bool gpt_sampler_trigger_grammar(const struct llama_model * model, gpt_sampler * gsmpl, const std::string & trigger); +bool common_sampler_trigger_grammar(const struct llama_model * model, common_sampler * gsmpl, const std::string & trigger); -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector gpt_sampler_types_from_chars(const std::string & chars); +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector common_sampler_types_from_chars(const std::string & chars); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7be609054..e0b1b2bf9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -15,6 +15,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from itertools import chain import math import numpy as np @@ -64,7 +65,6 @@ class Model: model_name: str | None metadata_override: Path | None dir_model_card: Path - is_lora: bool # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -72,7 +72,7 @@ class Model: def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False): + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -94,7 +94,6 @@ class Model: self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -270,10 +269,14 @@ class Model: return False + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - for name, data_torch in self.get_tensors(): + for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue @@ -291,8 +294,13 @@ class Model: bid = int(part) break - for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray # type hint + for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): + data = data_torch.squeeze().numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() + n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) @@ -592,6 +600,9 @@ class Model: if chkhsh 
== "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": # ref: https://huggingface.co/databricks/dbrx-base res = "dbrx" + if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" @@ -640,6 +651,9 @@ class Model: if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": # ref: https://huggingface.co/microsoft/phi-2 res = "phi-2" + if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": + # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" if res is None: logger.warning("\n") @@ -1606,7 +1620,7 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -1633,9 +1647,9 @@ class LlamaModel(Model): smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: @@ -1859,8 +1873,6 @@ class MiniCPM3Model(Model): def set_gguf_parameters(self): hparams = self.hparams - rope_dims = hparams["qk_rope_head_dim"] - self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) @@ -1876,24 +1888,25 @@ class MiniCPM3Model(Model): self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: - return + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - 
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_sentencepiece() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2205,6 +2218,13 @@ class Phi3MiniModel(Model): self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) if rope_scaling is None: @@ -2234,9 +2254,8 @@ class Phi3MiniModel(Model): if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - if not self.is_lora: - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) @Model.register("PlamoForCausalLM") @@ -2598,7 +2617,7 @@ class NomicBertModel(BertModel): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel") +@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -2696,6 +2715,11 @@ class XLMRobertaModel(BertModel): self.gguf_writer.add_add_eos_token(True) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: @@ -2840,6 +2864,8 @@ class Rwkv6Model(Model): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + special_vocab._set_special_token("eot", 261) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -3107,6 +3133,14 @@ class JinaBertV2Model(BertModel): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "bert.", remove the prefix + # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + if name.startswith("bert."): + name = name[5:] + + return super().modify_tensors(data_torch, name, bid) + @Model.register("OpenELMForCausalLM") class OpenELMModel(Model): @@ -4047,7 +4081,7 @@ class ExaoneModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -4074,10 +4108,7 @@ class ExaoneModel(Model): smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) - - super().prepare_tensors() + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) @Model.register("GraniteForCausalLM") @@ -4138,6 +4169,48 @@ class GraniteMoeModel(GraniteModel): return super().modify_tensors(data_torch, name, bid) +@Model.register("ChameleonForConditionalGeneration") +@Model.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(Model): + model_arch = gguf.MODEL_ARCH.CHAMELEON + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) + + def set_vocab(self): + self._set_vocab_gpt2() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ignore image tokenizer for now + # TODO: remove this once image support is implemented for Chameleon + if name.startswith("model.vqmodel"): + return [] + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = 
ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) + + return [(self.map_tensor_name(name), data_torch)] + + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch + + ###### CONVERSION LOGIC ###### diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 021f65abd..022354a3b 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -81,6 +81,7 @@ models = [ {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, + {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", }, {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, @@ -99,6 +100,7 @@ models = [ {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, + {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, ] diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index d1c94e580..439a78de1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -331,6 +331,10 @@ if __name__ == '__main__': self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) super().set_gguf_parameters() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Never add extra tensors (e.g. rope_freqs) for LoRA adapters + return () + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -392,7 +396,6 @@ if __name__ == '__main__': dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, - is_lora=True, ) logger.info("Exporting model...") diff --git a/docs/android.md b/docs/android.md index cec4358d9..320b62240 100644 --- a/docs/android.md +++ b/docs/android.md @@ -2,55 +2,82 @@ # Android ## Build on Android using Termux -[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required). -``` -apt update && apt upgrade -y -apt install git make cmake -``` -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` +[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). 
As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. -[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`. - -## Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: -``` -$ mkdir build-android -$ cd build-android -$ export NDK= -$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -$ make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` +With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell: ``` -$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/ +$ apt update && apt upgrade -y +$ apt install git cmake ``` -Now, you can start chatting: +Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. + +Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: + ``` -$cd /data/data/com.termux/files/home/bin -$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml +$ curl -L {model-url} -o ~/{model}.gguf ``` -Here's a demo of an interactive session running on Pixel 5 phone: +Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: + +``` +$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +``` + +Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. + +To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 + +## Cross-compile using Android NDK +It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. 
If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) + +Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: + +``` +$ cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-28 \ + -DCMAKE_C_FLAGS="-march=armv8.7a" \ + -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ + -DGGML_OPENMP=OFF \ + -DGGML_LLAMAFILE=OFF \ + -B build-android +``` + +Notes: + - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time + - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325) + +The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use. + +Feel free to adjust the Android ABI for your target. Once the project is configured: + +``` +$ cmake --build build-android --config Release -j{n} +$ cmake --install build-android --prefix {install-dir} --config Release +``` + +After installing, go ahead and download the model of your choice to your host system. Then: + +``` +$ adb shell "mkdir /data/local/tmp/llama.cpp" +$ adb push {install-dir} /data/local/tmp/llama.cpp/ +$ adb push {model}.gguf /data/local/tmp/llama.cpp/ +$ adb shell +``` + +In the `adb shell`: + +``` +$ cd /data/local/tmp/llama.cpp +$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" +``` + +That's it! + +Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`. diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index bc266f7d8..ea34182e4 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -26,7 +26,7 @@ ### Llama.cpp + SYCL -The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*). +The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. 
## Recommended Release @@ -111,10 +111,18 @@ SYCL backend supports Intel GPU Family: **Verified devices** -| Nvidia GPU | Status | Verified Model | -|--------------------------|---------|----------------| -| Ampere Series | Support | A100, A4000 | -| Ampere Series *(Mobile)* | Support | RTX 40 Series | +| Nvidia GPU | Status | Verified Model | +|--------------------------|-----------|----------------| +| Ampere Series | Supported | A100, A4000 | +| Ampere Series *(Mobile)* | Supported | RTX 40 Series | + +| AMD GPU | Status | Verified Model | +|--------------------------|--------------|----------------| +| Radeon Pro | Experimental | W6800 | +| Radeon RX | Experimental | 6700 XT | + +Note: AMD GPU support is highly experimental and is incompatible with F16. +Additionally, it only supports GPUs with a sub_group_size (warp size) of 32. ## Docker The docker build option is currently limited to *intel GPU* targets. @@ -186,6 +194,10 @@ Platform #0: Intel(R) OpenCL HD Graphics In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. +- **AMD GPU** + +To target AMD GPUs with SYCL, the ROCm stack must be installed first. + 2. **Install Intel® oneAPI Base toolkit** - **For Intel GPU** @@ -212,6 +224,19 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB cmake --build buildWithCublas --config Release ``` +- **Adding support to AMD GPUs** + +**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. + +**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs. + +```sh +git clone https://github.com/oneapi-src/oneMKL +cd oneMKL +# Find your HIPTARGET with rocminfo, under the key 'Name:' +cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas +cmake --build buildWithrocBLAS --config Release +``` 3. **Verify installation and environment** @@ -223,22 +248,32 @@ sycl-ls - **Intel GPU** -When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below: +When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. 
Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below: ``` -[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] -[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] -[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] -[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] +[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] ``` - **Nvidia GPU** -Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: +Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below: + ``` -[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] -[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] -[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2] +[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] +[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] +[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5] +``` + +- **AMD GPU** + +For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: + +``` +[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000] +[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] ``` ### II. Build llama.cpp @@ -266,6 +301,7 @@ cmake --build build --config Release -j -v ``` #### Nvidia GPU + ```sh # Export relevant ENV variables export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH @@ -283,7 +319,25 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx - # build all binary cmake --build build --config Release -j -v +``` +#### AMD GPU + +```sh +# Export relevant ENV variables +export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH +export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR + +# Build LLAMA with rocBLAS acceleration through SYCL + +## AMD +# Use FP32, FP16 is not supported +# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:' +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +# build all binary +cmake --build build --config Release -j -v ``` ### III. 
Run the inference @@ -586,11 +640,11 @@ use 1 SYCL GPUs: [0] with Max compute units:512 #### Build -| Name | Value | Function | -|--------------------|-----------------------------------|---------------------------------------------| -| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| -| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | -| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| Name | Value | Function | +|--------------------|---------------------------------------|---------------------------------------------| +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better performance than FP16 on quantized models| +| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | +| GGML_SYCL_F16 | OFF *(default)* \| ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/docs/build.md b/docs/build.md index faa0ecfa4..4e362ebc7 100644 --- a/docs/build.md +++ b/docs/build.md @@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance: ### MUSA +This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). + - Using `make`: ```bash make GGML_MUSA=1 ``` - Using `CMake`: ```bash cmake -B build -DGGML_MUSA=ON cmake --build build --config Release ``` +The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used. + +The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory on Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. + +Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. ### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. diff --git a/docs/docker.md b/docs/docker.md index e8a084173..8d90e6ded 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -19,8 +19,11 @@ Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) -The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). +The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
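To make the MUSA notes in `docs/build.md` above concrete, here is a minimal sketch of a native (non-Docker) build and run. The model path and prompt are placeholders, and the flag values simply mirror the Docker examples elsewhere in this document:

```bash
# Build the MUSA backend (assumes the MUSA SDK is already installed).
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release -j$(nproc)

# Run on the first MTT GPU only, with unified memory enabled so generation can
# spill into system RAM instead of aborting when VRAM is exhausted.
MUSA_VISIBLE_DEVICES=0 GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 \
  ./build/bin/llama-cli -m /models/7B/ggml-model-q4_0.gguf \
  -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```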
## Usage @@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` + +## Docker With MUSA + +Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container. + +## Building Docker locally + +```bash +docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile . +docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile . +docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile . +``` + +You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. + +The defaults are: + +- `MUSA_VERSION` set to `rc3.1.0` + +The resulting images are essentially the same as the non-MUSA images: + +1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4-bit. +2. `local/llama.cpp:light-musa`: This image only includes the main executable file. +3. `local/llama.cpp:server-musa`: This image only includes the server executable file. + +## Usage + +After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
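For reference, here are the runtime-setup commands from the paragraph above collected into one copy-pasteable block (a sketch; the `/usr/bin/musa` path is the mt-container-toolkit location given in the text):

```bash
# Register the mthreads runtime with Docker, then confirm it is active
# before running the MUSA images shown in the examples that follow.
(cd /usr/bin/musa && sudo ./docker setup $PWD)
docker info | grep mthreads
```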
+ +```bash +docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 67b3d2774..ead630661 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,7 +16,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 4a15941f1..a3b21ad6b 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } - gpt_init(); + common_init(); int is_pp_shared = params.is_pp_shared; @@ -36,7 +36,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -45,7 +45,7 @@ int main(int argc, char ** argv) { return 1; } - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); @@ -74,7 +74,6 @@ int main(int argc, char ** argv) { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, - 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); @@ -92,7 +91,7 @@ int main(int argc, char ** argv) { // warm up { for (int i = 0; i < 16; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + common_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -122,11 +121,11 @@ int main(int argc, char ** argv) { continue; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int i = 0; i < pp; ++i) { for (int j = 0; j < (is_pp_shared ? 
1 : pl); ++j) { - llama_batch_add(batch, 0, i, { j }, false); + common_batch_add(batch, 0, i, { j }, false); } } batch.logits[batch.n_tokens - 1] = true; @@ -151,10 +150,10 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < pl; ++j) { - llama_batch_add(batch, 0, pp + i, { j }, true); + common_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 7887a43d6..3b554033e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } - gpt_init(); + common_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -39,7 +39,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -51,13 +51,13 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize(model, params.prompt, true); + tokens_list = common_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { LOG("\n"); for (auto id : tokens_list) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } // create a llama_batch @@ -108,7 +108,7 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, seq_ids, false); + common_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); @@ -123,8 +123,8 @@ int main(int argc, char ** argv) { decoder_start_token_id = llama_token_bos(model); } - llama_batch_clear(batch); - llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + common_batch_clear(batch); + common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } // llama_decode will output logits only for the last token of the prompt @@ -161,7 +161,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // sample the next token for each parallel sequence / stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -185,15 +185,15 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); } - 
streams[i] += llama_token_to_piece(ctx, new_token_id); + streams[i] += common_token_to_piece(ctx, new_token_id); i_batch[i] = batch.n_tokens; // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { i }, true); + common_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt deleted file mode 100644 index 34a58cc02..000000000 --- a/examples/benchmark/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-bench-matmult) -add_executable(${TARGET} benchmark-matmult.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp deleted file mode 100644 index 922daf528..000000000 --- a/examples/benchmark/benchmark-matmult.cpp +++ /dev/null @@ -1,275 +0,0 @@ -#include "common.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -static float tensor_sum_elements(const ggml_tensor * tensor) { - double sum = 0; - if (tensor->type == GGML_TYPE_F32) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; - } - } - } - return sum; -} - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); - float sum = tensor_sum_elements(tensor); - printf("Sum of tensor %s is %6.2f\n", name, sum); -} - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -struct benchmark_params_struct { - int n_threads = 1; - int32_t n_iterations = 10; -}; - -static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); - fprintf(stderr, "\n"); -} - -int main(int argc, char ** argv) { - struct benchmark_params_struct benchmark_params; - - bool invalid_param = false; - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - benchmark_params.n_threads = std::stoi(argv[i]); - } else if (arg == "-i" || arg == "--iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - 
benchmark_params.n_iterations = std::stoi(argv[i]); - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv, benchmark_params); - exit(0); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - print_usage(argc, argv, benchmark_params); - exit(1); - } - - print_build_info(); - printf("Starting Test\n"); - - // create the ggml context - struct ggml_context * ctx; - //const int sizex = 4096; - //const int sizey = 11008; - -#undef VERBOSE_DEBUGGING -#ifndef VERBOSE_DEBUGGING - const int sizey = 4096; - const int sizex = 11008; - const int sizez = 128; -#else - /* Working - let's increase size */ - const int sizey = 1; - const int sizex = (8*32); - const int sizez = 1; - - /*const int sizey = 1; - const int sizex = 3*(8*32); - const int sizez = 1;*/ -#endif - - //printf("Memsize required = %i\n", sizex*sizex); - - // TODO: perform the bench for all types or for a user specified type - const ggml_type qtype = GGML_TYPE_Q4_1; - - size_t ctx_size = 0; - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += 1024*1024*16; - - printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; - - ctx = ggml_init(params); - if (!ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - - - printf("Creating new tensors\n"); - // printf("Creating new tensor m1\n"); - struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - ggml_set_f32(m11, 1.0f); - - // printf("Creating new tensor m1\n"); - struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - ggml_set_f32(m12, 1.5f); - - // printf("Creating new tensor m2\n"); - struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); - ggml_set_f32(m2, 2.0f); - - printf("\n------ Test 1 - Matrix Mult via F32 code\n"); - // printf("Creating new tensor m11xm2\n"); - struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); - - // printf("Creating compute graph\n"); - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, m11xm2); - - printf("n_threads=%i\n", benchmark_params.n_threads); - - TENSOR_DUMP(m11); - TENSOR_DUMP(m2); - - std::vector work_buffer; - - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - - TENSOR_DUMP(ggml_graph_node(gf, 0)); - - printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); - - int32_t nelements = sizex*sizey; - - // Set up a the benchmark matrices - // printf("Creating new tensor q11 & Running quantize\n"); - struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); - - // Set up a the compute graph - // printf("Creating new tensor q31\n"); - struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); - - // printf("Creating compute graph\n"); - struct ggml_cgraph * gf31 = ggml_new_graph(ctx); - ggml_build_forward_expand(gf31, q31); - - // Set up a second graph computation to make sure we 
override the CPU cache lines - // printf("Creating new tensor q12 & Running quantize\n"); - struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); - - // printf("Creating new tensor q32\n"); - struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); - - //printf("Creating compute graph\n"); - struct ggml_cgraph * gf32 = ggml_new_graph(ctx); - ggml_build_forward_expand(gf32, q32); - printf("n_threads=%i\n", benchmark_params.n_threads); - - const int dimx = sizex; - const int dimy = sizey; - const int dimz = sizez; - long long int flops_per_dot_product = dimy + dimy; - long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; - printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); - - - // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); - - printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); - printf("=====================================================================================\n"); - - double gflops_sum = 0; - for (int i=0;i allowed_delta) { - printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", - sum_of_F32_reference, - sum_of_Q4_result, - delta, - allowed_delta - ); - exit(0); - } - - // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); - } - printf("\n"); - printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); - printf("=====================================================================================\n"); -} diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index ecff95f9a..988a584c9 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -201,7 +201,7 @@ static void print_sample_weights(TransformerWeights *w){ //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. 
-struct llama_vocab { +struct my_llama_vocab { using id = int32_t; using token = std::string; using ttype = llama_token_type; @@ -525,7 +525,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { +static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) { if (is_ggml_file(filename)) { LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; @@ -583,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (llama_vocab::id id=0; idtoken_embedding_table -> model->tok_embeddings @@ -671,7 +671,7 @@ static void save_as_llama_model( std::vector tokens; std::vector scores; std::vector token_types; - for (const llama_vocab::token_data & token_data : vocab->id_to_token) { + for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) { tokens.push_back(token_data.text.c_str()); scores.push_back(token_data.score); token_types.push_back(token_data.type); @@ -872,7 +872,7 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { - gpt_init(); + common_init(); struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { @@ -905,7 +905,7 @@ int main(int argc, char ** argv) { fclose(file); } - struct llama_vocab vocab; + struct my_llama_vocab vocab; load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 41bf4eb2a..d1731bba6 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -31,7 +31,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; @@ -272,8 +272,8 @@ struct tokenized_prompt { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); - tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); + tokens_pos = common_tokenize(ctx, pos, add_bos, true); + tokens_neg = common_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -281,7 +281,7 @@ struct tokenized_prompt { void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { // TODO: customize padding token - std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); + std::vector pad_tokens = common_tokenize(ctx, " ", false); llama_token pad_tok = pad_tokens.back(); while (tokens.size() < len) { tokens.push_back(pad_tok); @@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { llama_kv_cache_clear(ctx); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { 
fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -370,7 +370,7 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. */ -static int prepare_entries(gpt_params & params, train_context & ctx_train) { +static int prepare_entries(common_params & params, train_context & ctx_train) { // load prompts std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); @@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -413,7 +413,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index a969c486d..f6e307fbc 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -204,13 +204,6 @@ static ggml_status compute_piter( ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); } -// TODO: enable GPU support when support for GGML_OP_SQRT is added -//#ifdef GGML_USE_METAL -// if (ggml_backend_is_metal(model.backend)) { -// ggml_backend_metal_set_n_cb(model.backend, params.n_threads); -// } -//#endif - ggml_status res = ggml_backend_graph_compute(model.backend, gf); if (res == GGML_STATUS_SUCCESS) { auto extract_i = [](std::string prefix, std::string str) -> int { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index a438dcb5a..3f18fc6a7 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -28,7 +28,7 @@ static std::vector split_lines(const std::string & s, const std::st static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + embd_pos * n_embd; - llama_embd_normalize(embd, out, n_embd, embd_norm); + common_embd_normalize(embd, out, n_embd, embd_norm); } } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } - gpt_init(); + common_init(); params.embedding = true; // For non-causal models, batch size must be equal to ubatch size @@ -95,7 +95,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; 
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -135,7 +135,7 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true, false); + auto inp = common_tokenize(ctx, prompt, true, true); if (inp.size() > n_batch) { LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str()); } LOG("\n\n"); } @@ -199,7 +199,7 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; s = 0; - llama_batch_clear(batch); + common_batch_clear(batch); } // add to batch @@ -234,6 +234,11 @@ int main(int argc, char ** argv) { } LOG("\n"); } + } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { + for (int j = 0; j < n_embd_count; j++) { + // NOTE: if you change this log - update the tests in ci/run.sh + LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + } } else { // print the first part of the embeddings or for a single prompt, the full embedding for (int j = 0; j < n_prompts; j++) { @@ -258,7 +263,7 @@ int main(int argc, char ** argv) { LOG("\n"); for (int i = 0; i < n_prompts; i++) { for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); LOG("%6.2f ", sim); } LOG("%1.10s", prompts[i].c_str()); @@ -291,7 +296,7 @@ int main(int argc, char ** argv) { for (int i = 0;;) { // at least two iteration (n_embd_count > 1) LOG(" ["); for (int j = 0;;) { // at least two iteration (n_embd_count > 1) - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); LOG("%6.2f", sim); j++; if (j < n_embd_count) LOG(", "); else break; diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6d629fe4e..c08e3e5f6 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -126,12 +126,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const gpt_params & params) { +static bool run(llama_context * ctx, const common_params & params) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { LOG_ERR("%s : failed to eval\n", 
__func__); return false; } @@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { callback_data cb_data; - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 0051a5eb6..67662313d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -128,7 +128,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors @@ -314,9 +314,9 @@ struct lora_merge_ctx { // optionally dequantize it printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); auto nels = ggml_nelements(inp_base); - ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + const auto * qtype = ggml_get_type_traits(base->type); std::vector dequant_buf(nels * sizeof(float)); - qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); } else { ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); @@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 4b19a9dc2..77c59a836 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) { file << "| -------- | ----------- |\n"; } -static void write_table_entry(std::ofstream & file, const llama_arg & opt) { +static void write_table_entry(std::ofstream & file, const common_arg & opt) { file << "| `"; // args for (const auto & arg : opt.args) { @@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) { file << "` | " << md_help << " |\n"; } -static void write_table(std::ofstream & file, std::vector & opts) { +static void write_table(std::ofstream & file, std::vector & opts) { write_table_header(file); for (const auto & opt : opts) { write_table_entry(file, *opt); @@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector & opts) { static void 
export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); - gpt_params params; - auto ctx_arg = gpt_params_parser_init(params, ex); + common_params params; + auto ctx_arg = common_params_parser_init(params, ex); - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; for (auto & opt : ctx_arg.options) { // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example if (opt.is_sparam) { diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 82c239b83..7e62657e1 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -22,12 +22,20 @@ #endif enum split_operation : uint8_t { - SPLIT_OP_SPLIT, - SPLIT_OP_MERGE, + OP_NONE, + OP_SPLIT, + OP_MERGE, +}; + +enum split_mode : uint8_t { + MODE_NONE, + MODE_TENSOR, + MODE_SIZE, }; struct split_params { - split_operation operation = SPLIT_OP_SPLIT; + split_operation operation = OP_NONE; + split_mode mode = MODE_NONE; size_t n_bytes_split = 0; int n_split_tensors = 128; std::string input; @@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } bool arg_found = false; - bool is_op_set = false; - bool is_mode_set = false; if (arg == "-h" || arg == "--help") { split_print_usage(argv[0]); exit(0); - } - if (arg == "--version") { + } else if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } - if (arg == "--dry-run") { + } else if (arg == "--dry-run") { arg_found = true; params.dry_run = true; - } - if (arg == "--no-tensor-first-split") { + } else if (arg == "--no-tensor-first-split") { arg_found = true; params.no_tensor_first_split = true; - } - - if (is_op_set) { - throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); - } - if (arg == "--merge") { + } else if (arg == "--merge") { arg_found = true; - is_op_set = true; - params.operation = SPLIT_OP_MERGE; - } - if (arg == "--split") { + if (params.operation != OP_NONE && params.operation != OP_MERGE) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_MERGE; + } else if (arg == "--split") { arg_found = true; - is_op_set = true; - params.operation = SPLIT_OP_SPLIT; - } - - if (is_mode_set) { - throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); - } - if (arg == "--split-max-tensors") { + if (params.operation != OP_NONE && params.operation != OP_SPLIT) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_SPLIT; + } else if (arg == "--split-max-tensors") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - is_mode_set = true; + if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_TENSOR; params.n_split_tensors = atoi(argv[arg_idx]); - } - if (arg == "--split-max-size") { + } else if (arg == "--split-max-size") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - is_mode_set = true; + 
if (params.mode != MODE_NONE && params.mode != MODE_SIZE) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_SIZE; params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); } @@ -148,6 +149,15 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } } + // the operation is split if not specified + if (params.operation == OP_NONE) { + params.operation = OP_SPLIT; + } + // the split mode is by tensor if not specified + if (params.mode == MODE_NONE) { + params.mode = MODE_TENSOR; + } + if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); } @@ -265,13 +275,15 @@ struct split_strategy { } bool should_split(int i_tensor, size_t next_size) { - if (params.n_bytes_split > 0) { + if (params.mode == MODE_SIZE) { // split by max size per file return next_size > params.n_bytes_split; - } else { + } else if (params.mode == MODE_TENSOR) { // split by number of tensors per file return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; } + // should never happen + GGML_ABORT("invalid mode"); } void print_info() { @@ -559,9 +571,9 @@ int main(int argc, const char ** argv) { split_params_parse(argc, argv, params); switch (params.operation) { - case SPLIT_OP_SPLIT: gguf_split(params); + case OP_SPLIT: gguf_split(params); break; - case SPLIT_OP_MERGE: gguf_merge(params); + case OP_MERGE: gguf_merge(params); break; default: split_print_usage(argv[0]); exit(EXIT_FAILURE); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 20b99a4fd..6e42fa073 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -15,11 +15,11 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { - llama_batch_clear(batch); + common_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = llama_tokenize(model, input_string, true, false); + std::vector inputs = common_tokenize(model, input_string, true, false); const int32_t n_toks = inputs.size(); @@ -28,7 +28,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // inputs.push_back(llama_token_eos(model)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = llama_tokenize(model, instruction, true, false).size(); + const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -40,7 +40,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); } // clear previous kv_cache values (irrelevant for embeddings) @@ -75,7 +75,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); result.push_back(emb_norm); #ifdef GRIT_DEBUG @@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = llama_tokenize(model, prompt, 
false, true); + std::vector inputs = common_tokenize(model, prompt, false, true); int32_t i_current_token = 0; while (true) { - llama_batch_clear(bat); + common_batch_clear(bat); { const int32_t n_inputs = inputs.size(); for (int32_t i = 0; i < n_inputs; i++) { - llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); } } inputs.clear(); @@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std break; } - std::string piece = llama_token_to_piece(ctx, token); + std::string piece = common_token_to_piece(ctx, token); if (stream) { std::printf("%s", piece.c_str()); std::fflush(stdout); @@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) { } int main(int argc, char * argv[]) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); - llama_model_params mparams = llama_model_params_from_gpt_params(params); - llama_context_params cparams = llama_context_params_from_gpt_params(params); + llama_model_params mparams = common_model_params_to_llama(params); + llama_context_params cparams = common_context_params_to_llama(params); llama_backend_init(); @@ -199,10 +199,10 @@ int main(int argc, char * argv[]) { const int n_embd = llama_n_embd(model); - const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); - const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index c8e273529..70ff47768 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -37,13 +37,13 @@ struct Stats { class IMatrixCollector { public: IMatrixCollector() = default; - void set_params(gpt_params params) { m_params = std::move(params); } + void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; - gpt_params m_params; + common_params m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; @@ -428,7 +428,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { +static bool compute_imatrix(llama_context * ctx, const common_params & params) { 
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const int n_ctx = llama_n_ctx(ctx); @@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { // clear the KV cache llama_kv_cache_clear(ctx); + llama_batch batch = llama_batch_init(n_batch, 0, 1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - // TODO: use batch.logits to save computations instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + common_batch_clear(batch); + for (int i = 0; i < batch_size; i++) { + common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); + } + + if (llama_decode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); + llama_batch_free(batch); return false; } @@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } } + llama_batch_free(batch); + const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { @@ -568,17 +577,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } - gpt_init(); + common_init(); params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -607,7 +616,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -625,7 +634,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index d52425ae6..f18362c91 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -35,8 +35,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; @@ -44,7 +44,7 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static void write_logfile( - const 
llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -95,12 +95,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -109,14 +109,14 @@ static void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } - gpt_init(); + common_init(); auto & sparams = params.sparams; @@ -166,7 +166,7 @@ int main(int argc, char ** argv) { llama_model * model = nullptr; llama_context * ctx = nullptr; - gpt_sampler * smpl = nullptr; + common_sampler * smpl = nullptr; g_model = &model; g_ctx = &ctx; @@ -174,7 +174,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -195,21 +195,21 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); std::vector embd_inp; std::vector embd_end; - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_prefix(model) >= 0); - GGML_ASSERT(llama_token_suffix(model) >= 0); + GGML_ASSERT(llama_token_fim_pre(model) >= 0); + GGML_ASSERT(llama_token_fim_suf(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; @@ -218,7 +218,7 @@ int main(int argc, char ** argv) { } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); + const llama_token middle_token = llama_token_fim_mid(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -257,13 +257,13 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_CNT("'\n"); } @@ -298,11 +298,11 @@ int main(int argc, char ** argv) { LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -396,7 +396,7 @@ int main(int argc, char ** argv) { LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -411,9 +411,9 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -434,7 +434,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], false); + common_sampler_accept(smpl, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -446,7 +446,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); LOG("%s", token_str.c_str()); if (embd.size() > 1) { @@ -465,10 +465,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", 
llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); } LOG("\n"); console::set_display(console::user_input); @@ -505,11 +505,11 @@ int main(int argc, char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; @@ -529,7 +529,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + else if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -579,7 +579,7 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); - const auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_inp = common_tokenize(ctx, buffer, false); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -587,7 +587,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } n_remain -= line_inp.size(); @@ -601,7 +601,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -620,17 +620,17 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); } LOG("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_backend_free(); return 0; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index a8779bf3b..fc9f0097f 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -540,7 +540,7 @@ class SchemaConverter: return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") def _resolve_ref(self, ref): diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index fb1d387b2..4a8ea9676 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -151,7 +151,7 @@ static std::string get_gpu_info() { int count = ggml_backend_sycl_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_sycl_get_device_description(i, buf, sizeof(buf)); + 
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf)); id += buf; if (i < count - 1) { id += "/"; @@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); -#ifdef GGML_USE_RPC - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); -#endif + if (llama_supports_rpc()) { + printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + } printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); @@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); -#ifdef GGML_USE_RPC - } else if (arg == "-rpc" || arg == "--rpc") { + } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; break; } params.rpc_servers.push_back(argv[i]); -#endif } else if (arg == "-sm" || arg == "--split-mode") { if (++i >= argc) { invalid_param = true; @@ -1430,7 +1428,7 @@ struct sql_printer : public printer { } }; -static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { +static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); @@ -1446,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); n_processed += n_tokens; } llama_synchronize(ctx); } -static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { +static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); @@ -1462,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) llama_token token = llama_add_bos_token(model) ? 
llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { - llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); + llama_decode(ctx, llama_batch_get_one(&token, 1)); llama_synchronize(ctx); token = std::rand() % n_vocab; } @@ -1598,13 +1596,13 @@ int main(int argc, char ** argv) { fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); - test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); } - test_gen(ctx, 1, 0, t.n_threads); + test_gen(ctx, 1, t.n_threads); } for (int i = 0; i < params.reps; i++) { @@ -1616,13 +1614,13 @@ int main(int argc, char ** argv) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); + test_gen(ctx, t.n_gen, t.n_threads); } uint64_t t_ns = get_time_ns() - t_start; diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 0a3806172..2d1dfba20 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -18,6 +18,7 @@ android { } externalNativeBuild { cmake { + arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index f611809c6..b3858ddfb 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( for (nri = 0; nri < nr; nri++) { LOGi("Benchmark prompt processing (pp)"); - llama_batch_clear(*batch); + common_batch_clear(*batch); const int n_tokens = pp; for (i = 0; i < n_tokens; i++) { - llama_batch_add(*batch, 0, i, { 0 }, false); + common_batch_add(*batch, 0, i, { 0 }, false); } batch->logits[batch->n_tokens - 1] = true; @@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { - llama_batch_clear(*batch); + common_batch_clear(*batch); for (j = 0; j < pl; j++) { - llama_batch_add(*batch, 0, i, { j }, true); + common_batch_add(*batch, 0, i, { j }, true); } LOGi("llama_decode() text generation: %d", i); @@ -283,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, nullptr, nullptr, nullptr, - 0, - 0, - 0, }; if (embd) { @@ -357,7 +354,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto context = reinterpret_cast(context_pointer); const auto batch = reinterpret_cast(batch_pointer); - const auto tokens_list = llama_tokenize(context, text, 1); + const auto tokens_list = common_tokenize(context, text, 1); auto n_ctx = llama_n_ctx(context); auto n_kv_req = tokens_list.size() + (n_len - 
tokens_list.size()); @@ -369,14 +366,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( } for (auto id : tokens_list) { - LOGi("%s", llama_token_to_piece(context, id).c_str()); + LOGi("%s", common_token_to_piece(context, id).c_str()); } - llama_batch_clear(*batch); + common_batch_clear(*batch); // evaluate the initial prompt for (auto i = 0; i < tokens_list.size(); i++) { - llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + common_batch_add(*batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -419,7 +416,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( return nullptr; } - auto new_token_chars = llama_token_to_piece(context, new_token_id); + auto new_token_chars = common_token_to_piece(context, new_token_id); cached_token_chars += new_token_chars; jstring new_token = nullptr; @@ -431,8 +428,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( new_token = env->NewStringUTF(""); } - llama_batch_clear(*batch); - llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + common_batch_clear(*batch); + common_batch_add(*batch, new_token_id, n_cur, { 0 }, true); env->CallVoidMethod(intvar_ncur, la_int_var_inc); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index dcd9803a2..65cd4eb51 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -46,7 +46,6 @@ actor LlamaContext { let sparams = llama_sampler_chain_default_params() self.sampling = llama_sampler_chain_init(sparams) llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4)) - llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax()) llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234)) } diff --git a/examples/llama.vim b/examples/llama.vim index 1b5ad6ba0..7a60442ad 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,135 +1,697 @@ -" Requires an already running llama.cpp server -" To install either copy or symlink to ~/.vim/autoload/llama.vim -" Then start with either :call llama#doLlamaGen(), -" or add a keybind to your vimrc such as -" nnoremap Z :call llama#doLlamaGen() -" Similarly, you could add an insert mode keybind with -" inoremap call llama#doLlamaGen() +" LLM-based text completion using llama.cpp " -" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc -" let g:llama_api_url = "192.168.1.10:8080" -" llama_overrides can also be set through buffer/window scopes. For instance -" autocmd filetype python let b:llama_overrides = {"temp": 0.2} -" Could be added to your .vimrc to automatically set a lower temperature when -" editing a python script -" Additionally, an override dict can be stored at the top of a file -" !*{"stop": ["User:"]} -" Could be added to the start of your chatlog.txt to set the stopping token -" These parameter dicts are merged together from lowest to highest priority: -" server default -> g:llama_overrides -> w:llama_overrides -> -" b:llama_overrides -> in file (!*) overrides +" requires: +" +" - neovim +" - curl +" - llama.cpp server instance +" - FIM-compatible model +" +" sample config: +" +" - Tab - accept the current suggestion +" - Shift+Tab - accept just the first line of the suggestion +" - Ctrl+F - toggle FIM completion manually +" +" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim +" +" start the llama.cpp server with a FIM-compatible model.
for example: +" +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 +" +" --batch-size [512, model max context] +" +" adjust the batch size to control how much of the provided local context will be used during the inference +" lower values will use a smaller part of the context around the cursor, which will result in faster processing +" +" --ubatch-size [64, 2048] +" +" chunks the batch into smaller chunks for faster processing +" depends on the specific hardware. use llama-bench to profile and determine the best size +" +" --cache-reuse (g:llama_config.n_predict, 1024] +" +" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict +" using a non-zero value enables context reuse on the server side which dramatically improves the performance at +" large contexts. a value of 256 should be good for all cases +" +" run this once to initialise llama.vim: +" +" :call llama#init() +" +" more info: https://github.com/ggerganov/llama.cpp/pull/9787 " -" Sublists (like logit_bias and stop) are overridden, not merged -" Example override: -" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} -if !exists("g:llama_api_url") - let g:llama_api_url= "127.0.0.1:8080" -endif -if !exists("g:llama_overrides") - let g:llama_overrides = {} -endif -const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } -const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"] -let s:linedict = {} -func s:callbackHandler(bufn, channel, msg) - if len(a:msg) < 3 - return - elseif a:msg[0] == "d" - let l:msg = a:msg[6:-1] - else - let l:msg = a:msg - endif - let l:decoded_msg = json_decode(l:msg) - let l:newtext = split(l:decoded_msg['content'], "\n", 1) - if len(l:newtext) > 0 - call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] ..
newtext[0]) - else - echo "nothing genned" - endif - if len(newtext) > 1 - let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1]) - let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1 - endif - if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop - echo "Finished generation" - endif +" colors (adjust to your liking) +highlight llama_hl_hint guifg=#ff772f +highlight llama_hl_info guifg=#77ff2f + +" general parameters: +" +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines before the cursor location to include in the local prefix +" n_suffix: number of lines after the cursor location to include in the local suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max allotted time for the prompt processing (TODO: not yet supported) +" t_max_predict_ms: max allotted time for the prediction +" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) +" auto_fim: trigger FIM completion automatically on cursor movement +" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor +" +" ring buffer of chunks, accumulated with time upon: +" +" - completion request +" - yank +" - entering a buffer +" - leaving a buffer +" - writing a file +" +" parameters for the ring-buffer with extra context: +" +" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) +" ring_chunk_size: max size of the chunks (in number of lines) +" note: adjust these numbers so that you don't overrun your context +" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM +" ring_update_ms: how often to process queued chunks in normal mode +" +let s:default_config = { + \ 'endpoint': 'http://127.0.0.1:8012/infill', + \ 'n_prefix': 256, + \ 'n_suffix': 64, + \ 'n_predict': 128, + \ 't_max_prompt_ms': 500, + \ 't_max_predict_ms': 1000, + \ 'show_info': 2, + \ 'auto_fim': v:true, + \ 'max_line_suffix': 8, + \ 'ring_n_chunks': 64, + \ 'ring_chunk_size': 64, + \ 'ring_scope': 1024, + \ 'ring_update_ms': 1000, + \ } + +let g:llama_config = get(g:, 'llama_config', s:default_config) + +function! s:rand(i0, i1) abort + return a:i0 + rand() % (a:i1 - a:i0 + 1) endfunction -func llama#doLlamaGen() - if exists("b:job") - if job_status(b:job) == "run" - call job_stop(b:job) - return - endif - endif - - let l:cbuffer = bufnr("%") - let s:linedict[l:cbuffer] = line('$') - let l:buflines = getbufline(l:cbuffer, 1, 1000) - let l:querydata = copy(s:querydata) - call extend(l:querydata, g:llama_overrides) - if exists("w:llama_overrides") - call extend(l:querydata, w:llama_overrides) - endif - if exists("b:llama_overrides") - call extend(l:querydata, b:llama_overrides) - endif - if l:buflines[0][0:1] == '!*' - let l:userdata = json_decode(l:buflines[0][2:-1]) - call extend(l:querydata, l:userdata) - let l:buflines = l:buflines[1:-1] - endif - let l:querydata.prompt = join(l:buflines, "\n") - let l:curlcommand = copy(s:curlcommand) - if exists("g:llama_api_key") - call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' ..
g:llama_api_key]) - endif - let l:curlcommand[2] = json_encode(l:querydata) - let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) -endfunction - -" Echos the tokkenization of the provided string , or cursor to end of word -" Onus is placed on the user to include the preceding space -func llama#tokenizeWord(...) - if (a:0 > 0) - let l:input = a:1 - else - exe "normal \"*ye" - let l:input = @* +function! llama#init() + if !executable('curl') + echohl WarningMsg + echo 'llama.vim requires the "curl" command to be available' + echohl None + return endif - let l:querydata = {"content": l:input} - let l:curlcommand = copy(s:curlcommand) - let l:curlcommand[2] = json_encode(l:querydata) - let l:curlcommand[8] = g:llama_api_url .. "/tokenize" - let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])}) -endfunction -func s:tokenizeWordCallback(plaintext, channel, msg) - echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens) -endfunction + let s:pos_x = 0 " cursor position upon start of completion + let s:pos_y = 0 + let s:line_cur = '' -" Echos the token count of the entire buffer (or provided string) -" Example usage :echo llama#tokenCount() -func llama#tokenCount(...) - if (a:0 > 0) - let l:buflines = a:1 - else - let l:buflines = getline(1,1000) - if l:buflines[0][0:1] == '!*' - let l:buflines = l:buflines[1:-1] + let s:line_cur_prefix = '' + let s:line_cur_suffix = '' + + let s:ring_chunks = [] " current set of chunks used as extra context + let s:ring_queued = [] " chunks that are queued to be sent for processing + let s:ring_n_evict = 0 + + let s:hint_shown = v:false + let s:pos_y_pick = -9999 " last y where we picked a chunk + let s:pos_dx = 0 + let s:content = [] + let s:can_accept = v:false + + let s:timer_fim = -1 + let s:t_fim_start = reltime() " used to measure total FIM time + let s:t_last_move = reltime() " last time the cursor moved + + let s:current_job = v:null + + augroup llama + autocmd! + autocmd InsertEnter * inoremap llama#fim_inline(v:false) + autocmd InsertLeavePre * call llama#fim_cancel() + + autocmd CursorMoved * call s:on_move() + autocmd CursorMovedI * call s:on_move() + autocmd CompleteChanged * call llama#fim_cancel() + + if g:llama_config.auto_fim + autocmd CursorMovedI * call llama#fim(v:true) endif - let l:buflines = join(l:buflines, "\n") + + " gather chunks upon yanking + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif + + " gather chunks upon entering/leaving a buffer + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + + " gather chunk upon saving the file + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + augroup END + + silent! 
call llama#fim_cancel() + + " init background update of the ring buffer + if g:llama_config.ring_n_chunks > 0 + call s:ring_update() endif - let l:querydata = {"content": l:buflines} - let l:curlcommand = copy(s:curlcommand) - let l:curlcommand[2] = json_encode(l:querydata) - let l:curlcommand[8] = g:llama_api_url .. "/tokenize" - let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"}) endfunction -func s:tokenCountCallback(channel, msg) - let resp = json_decode(a:msg) - echo len(resp.tokens) +" compute how similar two chunks of text are +" 0 - no similarity, 1 - high similarity +" TODO: figure out something better +function! s:chunk_sim(c0, c1) + let l:lines0 = len(a:c0) + let l:lines1 = len(a:c1) + + let l:common = 0 + + for l:line0 in a:c0 + for l:line1 in a:c1 + if l:line0 == l:line1 + let l:common += 1 + break + endif + endfor + endfor + + return 2.0 * l:common / (l:lines0 + l:lines1) +endfunction + +" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing +" +" no_mod - do not pick chunks from buffers with pending changes +" do_evict - evict chunks that are very similar to the new one +" +function! s:pick_chunk(text, no_mod, do_evict) + " do not pick chunks from buffers with pending changes or buffers that are not files + if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) + return + endif + + " if the extra context option is disabled - do nothing + if g:llama_config.ring_n_chunks <= 0 + return + endif + + " don't pick very small chunks + if len(a:text) < 3 + return + endif + + if len(a:text) + 1 < g:llama_config.ring_chunk_size + let l:chunk = a:text + else + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) + + let l:chunk = a:text[l:l0:l:l1] + endif + + let l:chunk_str = join(l:chunk, "\n") . "\n" + + " check if this chunk is already added + let l:exist = v:false + + for i in range(len(s:ring_chunks)) + if s:ring_chunks[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + + for i in range(len(s:ring_queued)) + if s:ring_queued[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + + if l:exist + return + endif + + " evict queued chunks that are very similar to the new one + for i in range(len(s:ring_queued) - 1, 0, -1) + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 + if a:do_evict + call remove(s:ring_queued, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " also from s:ring_chunks + for i in range(len(s:ring_chunks) - 1, 0, -1) + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + if a:do_evict + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " TODO: become parameter ? + if len(s:ring_queued) == 16 + call remove(s:ring_queued, 0) + endif + + call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + + "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) +endfunction + +" picks a queued chunk, sends it for processing and adds it to s:ring_chunks +" called every g:llama_config.ring_update_ms +function! 
s:ring_update() + call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) + + " update only if in normal mode or if the cursor hasn't moved for a while + if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 + return + endif + + if len(s:ring_queued) == 0 + return + endif + + " move the first queued chunk to the ring buffer + if len(s:ring_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_chunks, 0) + endif + + call add(s:ring_chunks, remove(s:ring_queued, 0)) + + "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) + + " send asynchronous job with the new extra context so that it is ready for the next FIM + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) + endfor + + " no samplers needed here + let l:request = json_encode({ + \ 'input_prefix': "", + \ 'input_suffix': "", + \ 'input_extra': l:extra_context, + \ 'prompt': "", + \ 'n_predict': 1, + \ 'temperature': 0.0, + \ 'stream': v:false, + \ 'samplers': ["temperature"], + \ 'cache_prompt': v:true, + \ 't_max_prompt_ms': 1, + \ 't_max_predict_ms': 1 + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + " no callbacks because we don't need to process the response + call jobstart(l:curl_command, {}) +endfunction + +" necessary for 'inoremap ' +function! llama#fim_inline(is_auto) abort + call llama#fim(a:is_auto) + return '' +endfunction + +" the main FIM call +" takes local context around the cursor and sends it together with the extra context to the server for completion +function! llama#fim(is_auto) abort + " we already have a suggestion for the current cursor position + if s:hint_shown && !a:is_auto + call llama#fim_cancel() + return + endif + + call llama#fim_cancel() + + " avoid sending repeated requests too fast + if reltimefloat(reltime(s:t_fim_start)) < 0.6 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + + let s:t_fim_start = reltime() + let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) + return + endif + + let s:t_fim_start = reltime() + + let s:content = [] + let s:can_accept = v:false + + let s:pos_x = col('.') - 1 + let s:pos_y = line('.') + let l:max_y = line('$') + + let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) + let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) + + let s:line_cur = getline('.') + + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) + + if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix + return + endif + + let l:prefix = "" + \ . join(l:lines_prefix, "\n") + \ . "\n" + + let l:prompt = "" + \ . s:line_cur_prefix + + let l:suffix = "" + \ . s:line_cur_suffix + \ . "\n" + \ . join(l:lines_suffix, "\n") + \ . 
"\n" + + " prepare the extra context data + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) + endfor + + " the indentation of the current line + let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + + let l:request = json_encode({ + \ 'input_prefix': l:prefix, + \ 'input_suffix': l:suffix, + \ 'input_extra': l:extra_context, + \ 'prompt': l:prompt, + \ 'n_predict': g:llama_config.n_predict, + \ 'n_indent': l:indent, + \ 'top_k': 40, + \ 'top_p': 0.99, + \ 'stream': v:false, + \ 'samplers': ["top_k", "top_p", "infill"], + \ 'cache_prompt': v:true, + \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + if s:current_job != v:null + call jobstop(s:current_job) + endif + + " send the request asynchronously + let s:current_job = jobstart(l:curl_command, { + \ 'on_stdout': function('s:fim_on_stdout'), + \ 'on_exit': function('s:fim_on_exit'), + \ 'stdout_buffered': v:true, + \ 'pos_x': s:pos_x, + \ 'pos_y': s:pos_y, + \ 'is_auto': a:is_auto + \ }) + + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " gather some extra context nearby and process it in the background + " only gather chunks if the cursor has moved a lot + " TODO: something more clever? reranking? + if a:is_auto && l:delta_y > 32 + " expand the prefix even further + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + + " pick a suffix chunk + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) + + let s:pos_y_pick = s:pos_y + endif +endfunction + +" if first_line == v:true accept only the first line of the response +function! llama#fim_accept(first_line) + " insert the suggestion at the cursor location + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) + if len(s:content) > 1 + if !a:first_line + call append(s:pos_y, s:content[1:-1]) + endif + endif + + " move the cursor to the end of the accepted text + if !a:first_line && len(s:content) > 1 + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) + else + call cursor(s:pos_y, s:pos_x + len(s:content[0])) + endif + endif + + call llama#fim_cancel() +endfunction + +function! llama#fim_cancel() + let s:hint_shown = v:false + + " clear the virtual text + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + + " remove the mappings + silent! iunmap + silent! iunmap + silent! iunmap +endfunction + +function! s:on_move() + let s:t_last_move = reltime() + + call llama#fim_cancel() +endfunction + +" callback that processes the FIM result from the server and displays the suggestion +function! 
s:fim_on_stdout(job_id, data, event) dict + let l:raw = join(a:data, "\n") + if len(l:raw) == 0 + return + endif + + if self.pos_x != col('.') - 1 || self.pos_y != line('.') + return + endif + + " show the suggestion only in insert mode + if mode() !=# 'i' + return + endif + + let s:pos_x = self.pos_x + let s:pos_y = self.pos_y + + let s:can_accept = v:true + let l:has_info = v:false + + if s:can_accept && v:shell_error + if !self.is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false + endif + + let l:n_prompt = 0 + let l:t_prompt_ms = 1.0 + let l:s_prompt = 0 + + let l:n_predict = 0 + let l:t_predict_ms = 1.0 + let l:s_predict = 0 + + " get the generated suggestion + if s:can_accept + let l:response = json_decode(l:raw) + + for l:part in split(get(l:response, 'content', ''), "\n", 1) + call add(s:content, l:part) + endfor + + " remove trailing new lines + while len(s:content) > 0 && s:content[-1] == "" + call remove(s:content, -1) + endwhile + + let l:generation_settings = get(l:response, 'generation_settings', {}) + let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) + + let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:truncated = get(l:response, 'truncated', v:false) + + " if response.timings is available + if len(get(l:response, 'timings', {})) > 0 + let l:has_info = v:true + let l:timings = get(l:response, 'timings', {}) + + let l:n_prompt = get(l:timings, 'prompt_n', 0) + let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) + let l:s_prompt = get(l:timings, 'prompt_per_second', 0) + + let l:n_predict = get(l:timings, 'predicted_n', 0) + let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) + let l:s_predict = get(l:timings, 'predicted_per_second', 0) + endif + endif + + if len(s:content) == 0 + call add(s:content, "") + let s:can_accept = v:false + endif + + if len(s:content) == 0 + return + endif + + " NOTE: the following is logic for discarding predictions that repeat existing text + " the code is quite ugly and there is very likely a simpler and more canonical way to implement this + " + " still, I wonder if there is some better way that avoids having to do these special hacks? + " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would + " start generating whatever we have given it via the extra context. but on the other hand, it's not very + " helpful to re-generate the same code that is already there + + " truncate the suggestion if the first line is empty + if len(s:content) == 1 && s:content[0] == "" + let s:content = [""] + endif + + " ... and the next lines are repeated + if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) + let s:content = [""] + endif + + " truncate the suggestion if it repeats the suffix + if len(s:content) == 1 && s:content[0] == s:line_cur_suffix + let s:content = [""] + endif + + " find the first non-empty line (strip whitespace) + let l:cmp_y = s:pos_y + 1 + while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' + let l:cmp_y += 1 + endwhile + + if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y) + " truncate the suggestion if it repeats the next line + if len(s:content) == 1 + let s:content = [""] + endif + + " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 + if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] + let s:content = [""] + endif + + " ... 
or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) + if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") + let s:content = [""] + endif + endif + + " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix + "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + "for i in range(1, len(s:content) - 1) + " if strlen(matchstr(s:content[i], '^\s*')) < l:indent + " let s:content = s:content[:i - 1] + " break + " endif + "endfor + + let s:pos_dx = len(s:content[-1]) + + let s:content[-1] .= s:line_cur_suffix + + call llama#fim_cancel() + + " display virtual text with the suggestion + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + + " construct the info message + if g:llama_config.show_info > 0 && l:has_info + let l:prefix = ' ' + + if l:truncated + let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx + \ ) + else + let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) + \ ) + endif + + if g:llama_config.show_info == 1 + " display the info in the statusline + let &statusline = l:info + let l:info = '' + endif + endif + + " display the suggestion and append the info to the end of the first line + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']], + \ 'virt_text_win_col': virtcol('.') - 1 + \ }) + + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), + \ 'virt_text_win_col': virtcol('.') + \ }) + + " setup accept shortcuts + inoremap :call llama#fim_accept(v:false) + inoremap :call llama#fim_accept(v:true) + + let s:hint_shown = v:true +endfunction + +function! s:fim_on_exit(job_id, exit_code, event) dict + if a:exit_code != 0 + echom "Job failed with exit code: " . 
a:exit_code + endif + + let s:current_job = v:null endfunction diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8aa7b0750..14e02c8dd 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2444,12 +2444,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); } -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(ctx->backend)) { - ggml_backend_metal_set_n_cb(ctx->backend, n_threads); - } -#endif - ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py index 36f6b92fb..4fa1d6cea 100644 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ b/examples/llava/convert_image_encoder_to_gguf.py @@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu) if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] + model.vision_model.encoder.layers.pop(-1) projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) @@ -288,7 +288,7 @@ if has_llava_projector: print("Projector tensors added\n") -state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] +state_dict = model.state_dict() for name, data in state_dict.items(): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): # we don't need this diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 8f437863f..161098585 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector n_batch) { n_eval = n_batch; } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { LOG_ERR("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); return false; } @@ -37,21 +37,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; } else { - ret = llama_token_to_piece(ctx_llama, id); + ret = common_token_to_piece(ctx_llama, id); } eval_id(ctx_llama, id, n_past); return ret.c_str(); @@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) { LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { +static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { // load and preprocess the image llava_image_embed * embed = NULL; @@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para return embed; } -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; @@ -159,16 +159,16 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ user_prompt = prompt.substr(image_pos + std::string("").length()); LOG_INF("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } LOG_INF("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -176,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); @@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); LOG("\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = llama_model_params_from_gpt_params(*params); + llama_model_params model_params = common_model_params_to_llama(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_to_llama(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 
2048 : params->n_ctx; // we need a longer context size to process image embeddings llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); @@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) { int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 8558c6bdc..be6988540 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -401,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co return true; } +struct llava_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { int n_embd = llama_n_embd(llama_get_model(ctx_llama)); @@ -409,8 +442,9 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; - if (llama_decode(ctx_llama, batch)) { + float * embd = image_embed->embed+i*n_embd; + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); + if (llama_decode(ctx_llama, llava_batch.batch)) { LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -432,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_ERR("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: couldn't embed the image\n", __func__); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index c5156c35b..cbecec343 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) { LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = llama_model_params_from_gpt_params(*params); + llama_model_params 
model_params = common_model_params_to_llama(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { auto prompt = params->prompt; if (prompt.empty()) { prompt = "describe the image in detail."; } - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_to_llama(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); @@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static struct clip_ctx * clip_init_context(gpt_params * params) { +static struct clip_ctx * clip_init_context(common_params * params) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -97,7 +97,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector n_batch) { n_eval = n_batch; } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); return false; } @@ -114,7 +114,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); } @@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str llava_image_embed_free(slice_embed); } -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { std::string system_prompt; int idx = 0; int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); @@ -162,22 +162,22 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e LOG_INF("%s: image token past: %d\n", __func__, n_past); } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; } else { - ret = llama_token_to_piece(ctx_llama, id); + ret = common_token_to_piece(ctx_llama, id); } eval_id(ctx_llama, id, n_past); return ret.c_str(); } -static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ +static struct llava_context * 
minicpmv_init(common_params * params, const std::string & fname, int &n_past){ auto * ctx_clip = clip_init_context(params); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { @@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ +static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par LOG_INF("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); return smpl; } -static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){ +static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); return tmp; @@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty())) { show_additional_info(argc, argv); @@ -290,7 +290,7 @@ int main(int argc, char ** argv) { fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); }else { while (true) { LOG(""); @@ -309,7 +309,7 @@ int main(int argc, char ** argv) { if (strstr(response.c_str(), "")) break; // minicpm-v fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } printf("\n"); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 49870b4a4..3c0ccfea2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,13 +37,13 @@ struct ngram_container { }; int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); const int W = 15; // lookahead window const int N = 5; // n-gram size @@ -56,7 +56,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { std::vector inp; std::vector all; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); all = inp; const int max_context_size = llama_n_ctx(ctx); @@ -79,7 +79,7 @@ int main(int argc, char ** argv) { LOG("\n\n"); for (auto id : inp) { - LOG("%s", 
llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -89,8 +89,8 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // eval the prompt - llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); + llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1)); + llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -156,12 +156,12 @@ int main(int argc, char ** argv) { // sample first token { - id = gpt_sampler_sample(smpl, ctx, 0); + id = common_sampler_sample(smpl, ctx, 0); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); LOG("%s", token_str.c_str()); fflush(stdout); @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ @@ -201,10 +201,10 @@ int main(int argc, char ** argv) { // V V V V V V // id { - llama_batch_clear(batch); + common_batch_clear(batch); // current token - first token of the first level - llama_batch_add(batch, id, n_past, seq_id_all, true); + common_batch_add(batch, id, n_past, seq_id_all, true); // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation { @@ -229,7 +229,7 @@ int main(int argc, char ** argv) { ngrams_cur[g].tokens [j + 1] = t; ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; - llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); + common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); } } } @@ -241,13 +241,13 @@ int main(int argc, char ** argv) { seq_id_look[j] = i + j + 1; } - llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); + common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); } // fill the rest of the levels for (int j = 1; j < N - 1; j++) { for (int i = 0; i < W; i++) { - llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); + common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); } } } @@ -281,13 +281,13 @@ int main(int argc, char ** argv) { } // sample the next token - id = gpt_sampler_sample(smpl, ctx, i_batch); + id = common_sampler_sample(smpl, ctx, i_batch); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // print { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (v == 0) { LOG("%s", token_str.c_str()); @@ -327,7 +327,7 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + 
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { @@ -336,7 +336,7 @@ int main(int argc, char ** argv) { const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { - const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); + const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); LOG("%s", token_str.c_str()); } @@ -358,7 +358,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -466,9 +466,9 @@ int main(int argc, char ** argv) { LOG_INF("n_accept = %d\n", n_accept); LOG_INF("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_kv_cache_view_free(&kvc_view); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 33287c02c..7ced0aa97 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -12,9 +12,9 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -23,7 +23,7 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -31,15 +31,15 @@ int main(int argc, char ** argv){ // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); fprintf(stderr, "%s: tokenization done\n", __func__); - llama_ngram_cache ngram_cache; - llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); + common_ngram_cache ngram_cache; + common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); - llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + common_ngram_cache_save(ngram_cache, params.lookup_cache_static); return 0; } diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp index 81e2b0436..6871c0f5f 100644 --- a/examples/lookup/lookup-merge.cpp +++ b/examples/lookup/lookup-merge.cpp @@ -33,15 +33,15 @@ int main(int argc, char ** argv){ } fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); - llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); + common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]); for (size_t i = 1; i < args.size()-1; ++i) { fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); - llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); + common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]); - llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); + common_ngram_cache_merge(ngram_cache_merged, ngram_cache); } 
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); - llama_ngram_cache_save(ngram_cache_merged, args.back()); + common_ngram_cache_save(ngram_cache_merged, args.back()); } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 6d1e1ceb9..7faebe7ba 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); const int n_draft = params.n_draft; @@ -28,18 +28,18 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); - llama_ngram_cache ngram_cache_context; - llama_ngram_cache ngram_cache_dynamic; - llama_ngram_cache ngram_cache_static; + common_ngram_cache ngram_cache_context; + common_ngram_cache ngram_cache_dynamic; + common_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; @@ -48,7 +48,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); @@ -57,7 +57,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -86,7 +86,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); t_draft_us += ggml_time_us() - t_start_draft_us; } @@ -105,7 +105,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -115,7 +115,7 @@ int main(int argc, char ** argv){ pseudo_output.push_back(inp_slice[pseudo_output.size()]); { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -133,7 +133,7 
@@ int main(int argc, char ** argv){ } // After each chunk, update the dynamic ngram cache with the context ngram cache: - llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); ngram_cache_context.clear(); } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 2ccd0e6c1..a04728b18 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); // max. number of additional tokens to draft if match is found const int n_draft = params.n_draft; @@ -31,29 +31,29 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); - llama_ngram_cache ngram_cache_context; - llama_ngram_cache ngram_cache_dynamic; - llama_ngram_cache ngram_cache_static; + common_ngram_cache ngram_cache_context; + common_ngram_cache ngram_cache_dynamic; + common_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; { // Fill up context ngram cache with tokens from user input: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); @@ -62,7 +62,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -80,7 +80,7 @@ int main(int argc, char ** argv){ LOG("\n\n"); for (auto id : inp) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -89,8 +89,8 @@ int main(int argc, char ** argv){ const auto t_enc_start = ggml_time_us(); - llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); + llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1)); + llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); const auto t_enc_end = ggml_time_us(); @@ -102,7 +102,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); std::vector draft; @@ -117,7 +117,7 @@ 
int main(int argc, char ** argv){ // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } // print current draft sequence @@ -126,11 +126,11 @@ int main(int argc, char ** argv){ int i_dft = 0; while (true) { // sample from the target model - llama_token id = gpt_sampler_sample(smpl, ctx, i_dft); + llama_token id = common_sampler_sample(smpl, ctx, i_dft); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (!params.use_color) { LOG("%s", token_str.c_str()); @@ -152,7 +152,7 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } @@ -178,7 +178,7 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } break; @@ -192,18 +192,18 @@ int main(int argc, char ** argv){ // clean the cache of draft tokens that weren't accepted llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - llama_batch_clear(batch_tgt); - llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); // Draft already contains a single token sampled from the model: GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft[0] == inp.back()); const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); for (size_t i = 1; i < draft.size(); ++i) { - llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); } t_draft_us += ggml_time_us() - t_start_draft_us; @@ -218,8 +218,8 @@ int main(int argc, char ** argv){ auto t_dec_end = ggml_time_us(); // Update dynamic ngram cache with context ngram cache and save it to disk: - llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); - llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); + common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); LOG("\n\n"); @@ -237,9 +237,9 @@ int main(int argc, char ** argv){ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("\ntarget:\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_batch_free(batch_tgt); diff --git a/examples/main/README.md b/examples/main/README.md index 6730effdf..7e192b9f2 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -69,7 +69,7 @@ In this 
section, we cover the most commonly used options for running the `llama- - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\' - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. -- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. ## Input Prompts @@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` +### XTC Sampling + +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. + +By removing top tokens, XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. + +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` + ### Logit Bias - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. @@ -284,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models. These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. -### Memory Float 32 - -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
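Referring back to the XTC Sampling description added above: the filtering rule it describes is small enough to sketch directly. The following standalone C++ sketch is not the PR's sampler implementation, just the stated rule, assuming the candidate list is already sorted by probability in descending order; the `candidate` struct and the `rng` parameter are illustrative.

```cpp
#include <random>
#include <vector>

struct candidate {
    int   token;
    float prob;
};

// Apply the XTC rule described above to a candidate list sorted by prob (descending):
// with probability xtc_probability, drop every candidate with prob >= xtc_threshold
// except the least probable of them.
static void xtc_filter(std::vector<candidate> & candidates,
                       float xtc_probability, float xtc_threshold, std::mt19937 & rng) {
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    if (dist(rng) >= xtc_probability) {
        return; // the filter is only applied with the configured chance
    }

    // count the candidates at or above the threshold
    size_t n_above = 0;
    while (n_above < candidates.size() && candidates[n_above].prob >= xtc_threshold) {
        ++n_above;
    }

    // need at least two such tokens: keep the least probable of them
    // plus everything below the threshold, remove the rest
    if (n_above >= 2) {
        candidates.erase(candidates.begin(), candidates.begin() + (n_above - 1));
    }
}
```

With `xtc_probability = 0.5` and `xtc_threshold = 0.1`, roughly half of the sampling steps drop every candidate at or above 10% probability except the least likely of them, which is what keeps the output coherent while steering away from the most predictable continuation.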
- ### Batch Size - `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 068d53b39..2c6ab8e20 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -33,8 +33,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::string * g_output_s; static std::vector * g_output_tokens; @@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) { } static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -114,12 +114,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, *g_output_s, *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -127,22 +127,22 @@ static void sigint_handler(int signo) { } #endif -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { - llama_chat_msg new_msg{role, content}; - auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { + common_chat_msg new_msg{role, content}; + auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } int main(int argc, char ** argv) { - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { return 1; } - gpt_init(); + common_init(); auto & sparams = params.sparams; @@ -187,9 +187,9 @@ int main(int argc, char ** argv) { llama_model * model = nullptr; llama_context * ctx = nullptr; - gpt_sampler * smpl = nullptr; + common_sampler * smpl = nullptr; - std::vector chat_msgs; + std::vector chat_msgs; g_model = &model; g_ctx = &ctx; @@ -197,7 +197,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -246,7 +246,7 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_INF("%s: chat 
template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } @@ -296,7 +296,7 @@ int main(int argc, char ** argv) { : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG_DBG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, prompt, true, true); + embd_inp = common_tokenize(ctx, prompt, true, true); } else { LOG_DBG("use session tokens\n"); embd_inp = session_tokens; @@ -379,13 +379,13 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_CNT("'\n"); } @@ -415,9 +415,9 @@ int main(int argc, char ** argv) { for (const auto & antiprompt : params.antiprompt) { LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + auto tmp = common_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -430,9 +430,9 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -440,23 +440,23 @@ int main(int argc, char ** argv) { if (!params.input_suffix.empty()) { LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); return 1; } - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); 
LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -524,7 +524,7 @@ int main(int argc, char ** argv) { int enc_input_size = embd_inp.size(); llama_token * enc_input_buf = embd_inp.data(); - if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -565,30 +565,30 @@ int main(int argc, char ** argv) { if (!params.ctx_shift){ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; - } else { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } + + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } } else { // context extension via Self-Extend @@ -644,7 +644,7 @@ int main(int argc, char ** argv) { LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -675,9 +675,9 @@ int main(int argc, char ** argv) { LOG_DBG("saved session to %s\n", path_session.c_str()); } - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); + common_sampler_accept(smpl, id, /* accept_grammar= */ true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -698,7 +698,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= 
params.n_batch) { @@ -710,7 +710,7 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); // Console/Stream Output LOG("%s", token_str.c_str()); @@ -741,7 +741,7 @@ int main(int argc, char ** argv) { is_antiprompt = false; // check for reverse prompt using special tokens - llama_token last_token = gpt_sampler_last(smpl); + llama_token last_token = common_sampler_last(smpl); auto match = antiprompts.findSingleTokenMatch(last_token); if (match.pos != std::string::npos) { if (params.interactive) { @@ -768,13 +768,13 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!antiprompts.stop_words.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, antiprompts.stop_words.front(), false, true); + const auto first_antiprompt = common_tokenize(ctx, antiprompts.stop_words.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } @@ -789,8 +789,8 @@ int main(int argc, char ** argv) { // if current token is not EOG, we add it to current assistant message if (params.conversation) { - const auto id = gpt_sampler_last(smpl); - assistant_ss << llama_token_to_piece(ctx, id, false); + const auto id = common_sampler_last(smpl); + assistant_ss << common_token_to_piece(ctx, id, false); } if (n_past > 0 && is_interacting) { @@ -848,9 +848,9 @@ int main(int argc, char ** argv) { ? 
chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); @@ -868,7 +868,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_s.append(llama_token_to_piece(ctx, token)); + output_s.append(common_token_to_piece(ctx, token)); } // reset assistant message @@ -885,7 +885,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -911,10 +911,10 @@ int main(int argc, char ** argv) { } LOG("\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_s, output_tokens); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_free(ctx); llama_free_model(model); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 81e2f7ed7..43c8f3ed5 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -54,7 +54,7 @@ static std::vector k_prompts = { struct client { ~client() { if (smpl) { - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } @@ -75,7 +75,7 @@ struct client { std::string prompt; std::string response; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; static void print_date_time() { @@ -103,13 +103,13 @@ static std::vector split_string(const std::string& input, char deli int main(int argc, char ** argv) { srand(1234); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } - gpt_init(); + common_init(); // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -130,7 +130,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -160,11 +160,11 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = gpt_sampler_init(model, params.sparams); + client.smpl = common_sampler_init(model, params.sparams); } std::vector tokens_system; - tokens_system = ::llama_tokenize(ctx, k_system, true); + tokens_system = common_tokenize(ctx, k_system, true); const int32_t n_tokens_system = tokens_system.size(); llama_seq_id g_seq_id = 0; @@ -189,7 +189,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { - 
llama_batch_add(batch, tokens_system[i], i, { 0 }, false); + common_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -210,10 +210,10 @@ int main(int argc, char ** argv) { while (true) { if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } - llama_batch_clear(batch); + common_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -223,7 +223,7 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); + common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } @@ -252,14 +252,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - gpt_sampler_reset(client.smpl); + common_sampler_reset(client.smpl); // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; - tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); + tokens_prompt = common_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); + common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -308,7 +308,6 @@ int main(int argc, char ** argv) { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, - 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); @@ -340,9 +339,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); + const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i); - gpt_sampler_accept(client.smpl, id, true); + common_sampler_accept(client.smpl, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -350,7 +349,7 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); client.response += token_str; client.sampled = id; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 7ef8d14f3..09bba708f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_junk = 250; params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } - gpt_init(); + common_init(); int n_junk = params.n_junk; int n_keep = params.n_keep; @@ -61,7 +61,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), 
model_params); @@ -72,7 +72,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; @@ -92,10 +92,10 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); + tokens_list = common_tokenize(ctx, params.prompt, true); // tokenize the prefix and use it as a sink - const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); + const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size(); const int n_tokens_all = tokens_list.size(); @@ -137,10 +137,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -171,10 +171,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -229,15 +229,15 @@ int main(int argc, char ** argv) { break; } - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); + common_batch_add(batch, new_token_id, n_past++, { 0 }, true); } n_cur += 1; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 87347135e..e803ff143 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -35,7 +35,7 @@ struct results_log_softmax { }; static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const struct results_perplexity & results ) { if (params.logdir.empty()) { @@ -169,7 +169,7 @@ static void process_logits( break; } lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]); const double v = -results.log_softmax; local_nll += v; local_nll2 += v*v; @@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, break; } lock.unlock(); - const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]); local_nll += v; local_nll2 += v*v; } @@ -281,7 +281,9 @@ static std::pair log_softmax(int n_vocab, const float * logits, c kld.sum_kld += sum; kld.sum_kld2 += sum*sum; ++kld.count; - if (imax == imax_base) ++kld.n_same_top; + if (imax == imax_base) { + ++kld.n_same_top; + } const float p_base = 
expf(-nll_base); const float p = expf(-nll); @@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens break; } lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); kld_values[i] = (float)v.first; p_diff_values[i] = v.second; } @@ -337,7 +339,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens } } -static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { +static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) { // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` @@ -348,7 +350,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); @@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + int count = 0; double nll = 0.0; @@ -405,14 +408,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // clear the KV cache llama_kv_cache_clear(ctx); + llama_batch batch = llama_batch_init(n_batch, 0, 1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); + common_batch_clear(batch); + for (int i = 0; i < batch_size; i++) { + common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); + } + //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); - // TODO: use llama_batch.logits instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + if (llama_decode(ctx, batch)) { //LOG_ERR("%s : failed to eval\n", __func__); + llama_batch_free(batch); return {tokens, -1, logit_history, prob_history}; } @@ -424,14 +434,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - const auto batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); if (j == 0) { tokens[batch_start] = token_org; } } + llama_batch_free(batch); + const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { @@ -447,11 +459,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - 
params.ppl_stride - 1; j < n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + logits.begin() + size_t(j + 0) * n_vocab, + logits.begin() + size_t(j + 1) * n_vocab); const float prob = softmax(tok_logits)[tokens[start + j + 1]]; logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; @@ -472,7 +483,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, std::exp(nll / count), logit_history, prob_history}; } -static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { +static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -500,7 +511,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -521,9 +532,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -538,7 +550,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector logits; if (num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); + logits.reserve(size_t(n_ctx) * n_vocab); } LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); @@ -620,7 +632,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab); } } @@ -661,7 +673,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } else { double av = nll/count; double av2 = nll2/count - av*av; - if (av2 > 0) av2 = sqrt(av2/(count-1)); + if (av2 > 0) { + av2 = sqrt(av2/(count-1)); + } LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } @@ -686,10 +700,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par return {tokens, ppl, logit_history, prob_history}; } -static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { +static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int n_batch, int n_vocab) { int prev_outputs = 0; - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + for (int i = 0; i < (int) batch.n_tokens; i += n_batch) { + 
const int n_tokens = std::min(n_batch, batch.n_tokens - i); llama_batch batch_view = { n_tokens, @@ -699,7 +713,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, - 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); @@ -713,7 +726,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< n_outputs += batch_view.logits[i] != 0; } - memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); + memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float)); prev_outputs += n_outputs; } @@ -728,7 +741,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto if (eval_results.size() != eval_pairs.size()) { eval_results.resize(eval_pairs.size()); } - if (eval_pairs.empty()) return; + if (eval_pairs.empty()) { + return; + } size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); @@ -736,11 +751,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { float local_logprobs[K_TOKEN_CHUNK]; while (true) { - size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); - if (first >= eval_results.size()) break; - size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); + const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); + if (first >= eval_results.size()) { + break; + } + const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); for (size_t i = first; i < last; ++i) { - auto logits = batch_logits + eval_pairs[i].first * n_vocab; + const auto * logits = batch_logits + eval_pairs[i].first * n_vocab; float max_logit = logits[0]; for (int j = 1; j < n_vocab; ++j) { max_logit = std::max(max_logit, logits[j]); @@ -763,7 +780,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto } } -static void hellaswag_score(llama_context * ctx, const gpt_params & params) { +static void hellaswag_score(llama_context * ctx, const common_params & params) { // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -844,7 +861,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j = 0; j < 4; j++) { hs_cur.ending[j] = prompt_lines[idx*6+2+j]; - hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); + hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); } // determine the common prefix of the endings @@ -877,10 +894,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -888,7 +906,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector tok_logits(n_vocab); // TODO: this could be made 
smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -900,7 +918,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -916,7 +934,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); + common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -926,7 +944,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // TODO: don't evaluate the last token of each sequence for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -975,7 +993,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { auto & hs_cur = hs_data[i]; // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1102,7 +1120,7 @@ static std::vector load_winogrande_from_csv(const std::string * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 * */ -static void winogrande_score(llama_context * ctx, const gpt_params & params) { +static void winogrande_score(llama_context * ctx, const common_params & params) { constexpr int k_min_trailing_ctx = 3; @@ -1136,8 +1154,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { LOG_INF("%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { - task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); - task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); + task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true); + task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true); task.common_prefix = 0; for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { @@ -1152,16 +1170,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[1].size() - task.common_prefix; - task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); - task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); + task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size(); + task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size(); } LOG_INF("%s : calculating winogrande score over 
selected tasks.\n", __func__); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1169,7 +1188,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { std::vector tok_logits(n_vocab); // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -1184,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits = 0; - llama_batch_clear(batch); + common_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { int n_logits = 0; @@ -1194,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); + common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; n_logits += 1; @@ -1202,7 +1221,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { for (int s = 0; s < 2; ++s) { // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { - llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); n_logits += 1; } } @@ -1359,7 +1378,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic } return false; } - task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); + task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true)); } auto min_len = task.seq_tokens.front().size(); for (auto& seq : task.seq_tokens) { @@ -1403,7 +1422,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic // git@hf.co:datasets/Stevross/mmlu // https://huggingface.co/datasets/truthful_qa // -static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { +static void multiple_choice_score(llama_context * ctx, const common_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1509,17 +1528,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params LOG("\ntask\tacc_norm\n"); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -1536,7 +1556,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + 
common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -1559,7 +1579,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (size_t i = 0; i < cur_task.common_prefix; ++i) { //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); - llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); + common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -1569,7 +1589,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // TODO: don't evaluate the last token of each sequence for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -1627,7 +1647,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1683,7 +1703,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params LOG_INF("\n"); } -static void kl_divergence(llama_context * ctx, const gpt_params & params) { +static void kl_divergence(llama_context * ctx, const common_params & params) { if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1709,7 +1729,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } - int n_vocab, n_chunk; + int n_vocab; + int n_chunk; in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { @@ -1720,7 +1741,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } - std::vector tokens(n_ctx * n_chunk); + std::vector tokens(size_t(n_ctx) * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; @@ -1737,7 +1758,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector logits; if (num_batches > 1) { - logits.reserve(n_ctx * n_vocab); + logits.reserve(size_t(n_ctx) * n_vocab); } std::vector workers(std::thread::hardware_concurrency() - 1); @@ -1778,6 +1799,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { // clear the KV cache llama_kv_cache_clear(ctx); + llama_batch batch = llama_batch_init(n_batch, 0, 1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); 
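The hunks above and below converge on the same pattern: the `llama_batch_*`/`gpt_*` helpers move to the `common_*` prefix, and `kl_divergence` additionally drops `llama_batch_get_one` in favour of an explicitly initialized `llama_batch`, so positions, sequence ids and per-token logit flags are spelled out and the batch is freed when done. A minimal standalone sketch of that pattern follows; the helper name `decode_chunk` is invented for illustration and is not part of the patch.

```cpp
// Sketch only (assumes "llama.h", "common.h" and <vector>): the explicit-batch
// pattern that replaces llama_batch_get_one() in this file.
static bool decode_chunk(llama_context * ctx,
                         const std::vector<llama_token> & tokens,
                         llama_pos pos0, llama_seq_id seq_id) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);

    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); i++) {
        // kl_divergence wants logits at every position; other callers in this
        // patch flag only the last token of the common prefix instead
        common_batch_add(batch, tokens[i], pos0 + (llama_pos) i, { seq_id }, true);
    }

    const bool ok = llama_decode(ctx, batch) == 0;

    llama_batch_free(batch);
    return ok;
}
```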
@@ -1790,9 +1813,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - // TODO: use llama_batch.logits instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + common_batch_clear(batch); + for (int i = 0; i < batch_size; i++) { + common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); + } + + if (llama_decode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); + llama_batch_free(batch); return; } @@ -1801,10 +1829,12 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); } } + llama_batch_free(batch); + const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { @@ -1822,7 +1852,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; @@ -1955,17 +1985,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } - gpt_init(); + common_init(); const int32_t n_ctx = params.n_ctx; @@ -2004,7 +2034,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model and apply lora adapter, if any - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -2023,7 +2053,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } struct results_perplexity results; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 498cbbe3c..e372856c6 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { } static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats ) { if (layer->type == GGML_TYPE_F16) { @@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk( // Run quantization function for a single layer and update 
error stats static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference, + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference, const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, std::vector & output_scratch, error_stats & total_error, int max_thread = 0 ) { @@ -371,8 +371,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (qfns.from_float && qfns.to_float) { + const auto * qfns = ggml_get_type_traits(type); + if (qfns->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); } @@ -393,7 +393,7 @@ int main(int argc, char ** argv) { test_roundtrip_on_layer( layer_name, params.per_layer_stats, - qfns, + *qfns, params.reference, kv_tensor.second, input_scratch, diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 5971690f1..1768aae51 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -77,7 +77,7 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - llama_embd_normalize(embd, out, n_embd); + common_embd_normalize(embd, out, n_embd); } } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } - gpt_init(); + common_init(); // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; @@ -149,7 +149,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -176,7 +176,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // max batch size @@ -185,7 +185,7 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim for (auto & chunk : chunks) { - auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); + auto inp = common_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); @@ -204,7 +204,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) 
chunks[i].tokens.size(); j++) { - LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } LOG_INF("\n\n"); } @@ -232,7 +232,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - llama_batch_clear(batch); + common_batch_clear(batch); p += s; s = 0; } @@ -260,20 +260,20 @@ int main(int argc, char ** argv) { while (true) { LOG("Enter query: "); std::getline(std::cin, query); - std::vector query_tokens = llama_tokenize(ctx, query, true); + std::vector query_tokens = common_tokenize(ctx, query, true); batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); - llama_batch_clear(query_batch); + common_batch_clear(query_batch); // compute cosine similarities { std::vector> similarities; for (int i = 0; i < n_chunks; i++) { - float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); + float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); similarities.push_back(std::make_pair(i, sim)); } diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 6342e6488..8354e37e5 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -6,6 +6,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -79,6 +83,12 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } +#elif GGML_USE_VULKAN + fprintf(stderr, "%s: using Vulkan backend\n", __func__); + backend = ggml_backend_vk_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); + } #endif // if there aren't GPU Backends fallback to CPU backend @@ -92,6 +102,8 @@ static ggml_backend_t create_backend() { static void get_backend_memory(size_t * free_mem, size_t * total_mem) { #ifdef GGML_USE_CUDA ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); +#elif GGML_USE_VULKAN + ggml_backend_vk_get_device_memory(0, free_mem, total_mem); #else #ifdef _WIN32 MEMORYSTATUSEX status; @@ -139,7 +151,7 @@ int main(int argc, char * argv[]) { get_backend_memory(&free_mem, &total_mem); } printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024)); - start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); + ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem); ggml_backend_free(backend); return 0; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 0117d9357..8c49a52a6 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -6,12 +6,12 @@ #include int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -28,7 +28,7 @@ int main(int argc, char ** argv) { std::string result2; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result 
llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -42,15 +42,21 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); // tokenize prompt - auto tokens = llama_tokenize(ctx, params.prompt, true); + auto tokens = common_tokenize(ctx, params.prompt, true); + + // prepare the batch + llama_batch batch = llama_batch_init(tokens.size(), 0, 1); + for (size_t i = 0; i < tokens.size(); i++) { + common_batch_add(batch, tokens[i], i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = true; // generate next token // evaluate prompt - llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); - n_past += tokens.size(); + llama_decode(ctx, batch); + n_past += batch.n_tokens; // save state (rng, logits, embedding and kv_cache) to file { @@ -72,13 +78,17 @@ int main(int argc, char ** argv) { for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = llama_token_to_piece(ctx, next_token); + auto next_token_str = common_token_to_piece(ctx, next_token); printf("%s", next_token_str.c_str()); result0 += next_token_str; - if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { + common_batch_clear(batch); + common_batch_add(batch, next_token, n_past, {0}, true); + + if (llama_decode(ctx, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_batch_free(batch); llama_free(ctx); llama_free_model(model); return 1; @@ -92,11 +102,10 @@ int main(int argc, char ** argv) { llama_free(ctx); // make new context - auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl2, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); printf("\nsecond run: %s", params.prompt.c_str()); @@ -128,13 +137,17 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = llama_token_to_piece(ctx2, next_token); + auto next_token_str = common_token_to_piece(ctx2, next_token); printf("%s", next_token_str.c_str()); result1 += next_token_str; - if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) { + common_batch_clear(batch); + common_batch_add(batch, next_token, n_past, {0}, true); + + if (llama_decode(ctx2, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_batch_free(batch); llama_free(ctx2); llama_free_model(model); return 1; @@ -152,11 +165,10 @@ int main(int argc, char ** argv) { } // make new context - auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl3, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); printf("\nsingle seq run: %s", params.prompt.c_str()); @@ -216,13 +228,17 @@ int main(int argc, char ** argv) { // third run with seq 1 
instead of 0 for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = llama_token_to_piece(ctx3, next_token); + auto next_token_str = common_token_to_piece(ctx3, next_token); printf("%s", next_token_str.c_str()); result2 += next_token_str; - if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) { + common_batch_clear(batch); + common_batch_add(batch, next_token, n_past, {1}, true); + + if (llama_decode(ctx3, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_batch_free(batch); llama_free(ctx3); llama_free_model(model); return 1; @@ -236,6 +252,7 @@ llama_sampler_free(smpl2); llama_sampler_free(smpl3); + llama_batch_free(batch); llama_free(ctx3); llama_free_model(model); diff --git a/examples/server/README.md b/examples/server/README.md index cf479aeac..fa0e205f8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes + * Reranking endpoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510) * Parallel decoding with multi-user support * Continuous batching * Multimodal (wip) @@ -17,12 +18,15 @@ The project is under active development, and we are [looking for feedback and co ## Usage + + **Common params** | Argument | Explanation | | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | | `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | @@ -56,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-gan, --grp-attn-n N` | group-attention factor (default: 1)
(env: LLAMA_ARG_GRP_ATTN_N) | -| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)
(env: LLAMA_ARG_GRP_ATTN_W) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | @@ -98,7 +100,7 @@ The project is under active development, and we are [looking for feedback and co | Argument | Explanation | | -------- | ----------- | | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) | -| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | | `--penalize-nl` | penalize newline tokens (default: false) | @@ -130,7 +132,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-context-shift` | disables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `-sp, --special` | special tokens output enabled (default: false) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | -| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | +| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | @@ -138,20 +140,24 @@ The project is under active development, and we are [looking for feedback and co | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key
(env: LLAMA_ARG_SSL_KEY_FILE) | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | -| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | +| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | -| `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--slots` | enable slots monitoring endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | +| `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | +| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | + Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Example usage of docker compose with environment variables: @@ -312,7 +318,6 @@ node index.js - The prompt is a string or an array with the first element given as a string - The model's `tokenizer.ggml.add_bos_token` metadata is `true` - - The system prompt is empty `temperature`: Adjust the randomness of the generated text. Default: `0.8` @@ -328,6 +333,8 @@ node index.js `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. + `n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0` + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. @@ -370,14 +377,14 @@ node index.js `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` + `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled. + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. **Response format** @@ -478,38 +485,99 @@ The same as [the embedding example](../embedding) does. 
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. +### POST `/reranking`: Rerank documents according to a given query + +Similar to https://jina.ai/reranker/ but might change in the future. +Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options. + + *Options:* + + `query`: The query against which the documents will be ranked. + + `documents`: An array strings representing the documents to be ranked. + + *Aliases:* + - `/rerank` + - `/v1/rerank` + - `/v1/reranking` + + *Examples:* + + ```shell + curl http://127.0.0.1:8012/v1/rerank \ + -H "Content-Type: application/json" \ + -d '{ + "model": "some-model", + "query": "What is panda?", + "top_n": 3, + "documents": [ + "hi", + "it is a bear", + "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." + ] + }' | jq + ``` + ### POST `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. - *Options:* +*Options:* - `input_prefix`: Set the prefix of the code to infill. +- `input_prefix`: Set the prefix of the code to infill. +- `input_suffix`: Set the suffix of the code to infill. +- `input_extra`: Additional context inserted before the FIM prefix. +- `prompt`: Added after the `FIM_MID` token - `input_suffix`: Set the suffix of the code to infill. +`input_extra` is array of `{"filename": string, "text": string}` objects. - It also accepts all the options of `/completion` except `stream` and `prompt`. +The endpoint also accepts all the options of `/completion`. -- **GET** `/props`: Return current server settings. +If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used: + +```txt +myproject +{chunk 0 filename} +{chunk 0 text} +{chunk 1 filename} +{chunk 1 text} +... +filename +[input_prefix][input_suffix][prompt] +``` + +If the tokens are missing, then the extra context is simply prefixed at the start: + +```txt +[input_extra][input_prefix][input_suffix][prompt] +``` + +### **GET** `/props`: Get server global properties. + +This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props` **Response format** ```json { - "assistant_name": "", - "user_name": "", "default_generation_settings": { ... }, "total_slots": 1, "chat_template": "" } ``` -- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. -- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. 
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template +### POST `/props`: Change server global properties. + +To use this endpoint with POST method, you need to start server with `--props` + +*Options:* + +- None yet + ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. @@ -855,28 +923,6 @@ To know the `id` of the adapter, use GET `/lora-adapters` ## More examples -### Change system prompt on runtime - -To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. - -`prompt`: Specify a context that you want all connecting clients to respect. - -`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. - -`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint. - -```json -{ - "system_prompt": { - "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", - "anti_prompt": "User:", - "assistant_name": "Assistant:" - } -} -``` - -**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. - ### Interactive mode Check the sample in [chat.mjs](chat.mjs). 
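The new `/infill` documentation above distinguishes two prompt layouts: a repo-level layout used when the model defines `FIM_REPO` and `FIM_FILE_SEP` tokens, and a plain concatenation of `input_extra` before the prefix/suffix/prompt fields otherwise. As a rough client-side illustration of the repo-level case only, the sketch below assembles the documented layout; the angle-bracket token spellings and the function name are placeholders, not actual model tokens and not part of this patch.

```cpp
#include <string>
#include <vector>

// Hypothetical sketch of the repo-level /infill layout described above.
// "<FIM_REPO>" and "<FIM_FILE_SEP>" stand in for whatever FIM tokens the model
// really defines; the server places `prompt` after the FIM_MID token, so the
// plain concatenation at the end is purely for illustration.
struct extra_chunk { std::string filename; std::string text; };

static std::string build_repo_level_prompt(const std::string              & repo_name,
                                           const std::string              & cur_filename,
                                           const std::vector<extra_chunk> & input_extra,
                                           const std::string              & input_prefix,
                                           const std::string              & input_suffix,
                                           const std::string              & prompt) {
    std::string out = "<FIM_REPO>" + repo_name + "\n";
    for (const auto & chunk : input_extra) {
        out += "<FIM_FILE_SEP>" + chunk.filename + "\n" + chunk.text + "\n";
    }
    out += "<FIM_FILE_SEP>" + cur_filename + "\n";
    out += input_prefix + input_suffix + prompt;
    return out;
}
```

When the model lacks those tokens, the same request degenerates to the `[input_extra][input_prefix][input_suffix][prompt]` form noted above.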
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index c87dd8f1e..ad4183cd9 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -43,6 +43,8 @@ top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -836,6 +838,8 @@ return html` ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. 
Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} @@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => { const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, + xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, + xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 07fec6a38..88065705f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -307,6 +307,8 @@ top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1013,6 +1015,8 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
diff --git a/examples/server/public/index.js b/examples/server/public/index.js index fe615ca25..32ec6e9e1 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function f(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function s(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}s.prototype.brand=t;s.prototype.h=function(){return!0};s.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};s.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};s.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};s.prototype.valueOf=function(){return this.value};s.prototype.toString=function(){return this.value+""};s.prototype.toJSON=function(){return this.value};s.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(s.prototype,"value",{get(){const t=f(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new s(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){s.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new s).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}s.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){s.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=f(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 
0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,f,s){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,s),i.__i=u,o=null,-1!==u&&(s--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:u>r?s>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof 
Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function Ft(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof 
e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(s.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof s){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][f+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,s as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,st as hydrate,C as isValidElement,S as options,ft as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,Ft as useContext,At as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as 
useSignal,tn as useSignalEffect,Et as useState}; +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order 
effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return 
t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function 
qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; diff --git 
a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 7267f3f9c..e67bb15c1 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -529,7 +529,7 @@ export class SchemaConverter { return joinSeq(); }; - return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") + return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space") } _notStrings(strings) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 61b900a08..45c295747 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -92,6 +92,7 @@ enum server_task_type { enum server_task_cmpl_type { SERVER_TASK_CMPL_TYPE_NORMAL, SERVER_TASK_CMPL_TYPE_EMBEDDING, + SERVER_TASK_CMPL_TYPE_RERANK, SERVER_TASK_CMPL_TYPE_INFILL, }; @@ -127,12 +128,17 @@ struct slot_params { bool stream = true; bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit json input_prefix; json input_suffix; + std::vector antiprompt; }; struct server_slot { @@ -162,15 +168,24 @@ struct server_slot { json prompt; // can be either a string, array of strings or array of token ids + json input_prefix; + json input_suffix; + json input_extra; + // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + std::vector extra_tokens; + + size_t last_nl_pos = 0; std::string generated_text; std::vector cache_tokens; std::vector generated_token_probs; server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + bool has_next_token = true; + bool has_new_line = false; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; @@ -186,26 +201,20 @@ struct server_slot { // sampling json json_schema; - struct gpt_sampler_params sparams; - struct gpt_sampler * smpl = nullptr; + struct common_sampler_params sparams; + struct common_sampler * smpl = nullptr; llama_token sampled; - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - // stats - size_t n_sent_text = 0; // number of sent text character + size_t n_sent_text = 0; // number of sent text character size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; double t_prompt_processing; // ms - double t_token_generation; // ms + double t_token_generation; // ms std::function callback_on_release; @@ -213,7 +222,9 @@ struct server_slot { SLT_DBG(*this, "%s", "\n"); n_prompt_tokens = 0; + last_nl_pos = 0; generated_text = ""; + has_new_line = false; truncated = false; stopped_eos = false; stopped_word = false; @@ -223,13 +234,11 @@ struct server_slot { 
n_sent_text = 0; n_sent_token_probs = 0; cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; - ga_i = 0; - n_past_se = 0; generated_token_probs.clear(); } - bool has_budget(gpt_params &global_params) { + bool has_budget(common_params &global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -581,9 +590,9 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector loras; + std::vector loras; - gpt_params params; + common_params params; llama_batch batch = {}; @@ -593,12 +602,6 @@ struct server_context { int32_t n_ctx; // total context for all clients / slots - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - // slots / clients std::vector slots; json default_generation_settings_for_props; @@ -625,20 +628,20 @@ struct server_context { // Clear any sampling context for (server_slot & slot : slots) { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + common_sampler_free(slot.smpl); } } llama_batch_free(batch); } - bool load_model(const gpt_params & params_) { + bool load_model(const common_params & params_) { params = params_; - // dedicate one sequence to the system prompt + // reserve one extra sequence (seq_id == 0) for extra features params.n_parallel += 1; - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -695,22 +698,6 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - - SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); - } - - slot.ga_i = 0; - slot.ga_n = ga_n; - slot.ga_w = ga_w; - slot.sparams = params.sparams; slot.callback_on_release = [this](int) { @@ -737,12 +724,7 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other chat templates - const bool TMP_FORCE_SPECIAL = true; - + std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. 
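// Sketch (not from the patch): tokenize() now forwards an explicit parse_special flag to
// common_tokenize() instead of the old hard-coded TMP_FORCE_SPECIAL, so each call site chooses
// whether control tokens embedded in the text are mapped to their special ids. Illustration,
// assuming a model whose vocabulary defines <|im_start|> as a control token:
std::vector<llama_token> as_control = common_tokenize(ctx, "<|im_start|>user", /*add_special=*/false, /*parse_special=*/true);  // "<|im_start|>" becomes a single special id
std::vector<llama_token> as_text    = common_tokenize(ctx, "<|im_start|>user", /*add_special=*/false, /*parse_special=*/false); // the same text is tokenized as plain characters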
std::vector prompt_tokens; @@ -755,10 +737,10 @@ struct server_context { std::vector p; if (first) { - p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, add_special, parse_special); first = false; } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, false, parse_special); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); @@ -772,7 +754,7 @@ struct server_context { } } else { auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); } return prompt_tokens; @@ -814,7 +796,7 @@ struct server_context { int slot_prompt_len = slot_prompt.size(); // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = common_part(slot_prompt, prompt); + int lcp_len = longest_common_prefix(slot_prompt, prompt); // fraction of the common substring length compared to the current slot's prompt length similarity = static_cast(lcp_len) / slot_prompt_len; @@ -872,9 +854,12 @@ struct server_context { slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", false); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); + slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -888,11 +873,13 @@ struct server_context { slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); + slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep); slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); // process "json_schema" and "grammar" if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { @@ -901,19 +888,14 @@ struct server_context { } if (data.contains("json_schema") && !data.contains("grammar")) { try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); + auto schema = 
json_value(data, "json_schema", json::object()); + slot.sparams.grammar = json_schema_to_grammar(schema); } catch (const std::exception & e) { send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); return false; } } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - } - - if (slot.params.cache_prompt && slot.ga_n != 1) { - slot.params.cache_prompt = false; - SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); + slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -923,11 +905,29 @@ struct server_context { } // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + slot.input_prefix = json_value(data, "input_prefix", json()); + slot.input_suffix = json_value(data, "input_suffix", json()); + slot.input_extra = json_value(data, "input_extra", json()); + + SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size()); + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk["text"].is_string()) { + send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + // filename is optional + if (chunk.contains("filename") && !chunk["filename"].is_string()) { + send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str()); + } // get prompt - if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { + { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); @@ -940,8 +940,17 @@ struct server_context { slot.prompt = *prompt; } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) { slot.prompt = prompt->at(0); + } else if (prompt->is_array() && prompt->size() > 1) { + // array of strings + for (const auto & el : *prompt) { + if (!el.is_string()) { + send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST); + return false; + } + } + slot.prompt = *prompt; } else { - send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); + send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST); return false; } } @@ -974,7 +983,7 @@ struct server_context { slot.sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { - auto toks = llama_tokenize(model, el[0].get(), false); + auto toks = common_tokenize(model, el[0].get(), false); for (auto tok : toks) { slot.sparams.logit_bias.push_back({tok, bias}); } @@ -1017,7 +1026,7 @@ struct server_context { sampler_names.emplace_back(name); } } - slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); + slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); } else { slot.sparams.samplers = default_sparams.samplers; } @@ -1025,10 +1034,10 @@ struct server_context { { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + 
common_sampler_free(slot.smpl); } - slot.smpl = gpt_sampler_init(model, slot.sparams); + slot.smpl = common_sampler_init(model, slot.sparams); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -1052,65 +1061,15 @@ struct server_context { clean_kv_cache = false; } - void system_prompt_update() { - SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); - - kv_cache_clear(); - system_tokens.clear(); - - if (!system_prompt.empty()) { - system_tokens = ::llama_tokenize(ctx, system_prompt, true); - - const int32_t n_batch = llama_n_batch(ctx); - const int32_t n_tokens_prompt = system_tokens.size(); - - for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - - llama_batch_clear(batch); - - for (int32_t j = 0; j < n_tokens; ++j) { - llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); - } - - if (llama_decode(ctx, batch) != 0) { - SRV_ERR("%s", "llama_decode() failed\n"); - return; - } - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i <= params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - } - - system_need_update = false; - } - - bool system_prompt_set(const std::string & sys_prompt) { - SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - - system_prompt = sys_prompt; - - // release all slots - for (server_slot & slot : slots) { - slot.release(); - } - - system_need_update = true; - return true; - } - bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); + const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); slot.sampled = result.tok; auto match = slot.antiprompts.findSingleTokenMatch(result.tok); if (match.pos != std::string::npos && !match.is_partial) { if (match.is_grammar_trigger) { - gpt_sampler_trigger_grammar(model, slot.smpl, llama_token_to_piece(ctx, result.tok, params.special)); + common_sampler_trigger_grammar(model, slot.smpl, common_token_to_piece(ctx, result.tok, params.special)); } else { slot.stopped_word = true; slot.stopping_word = match.pattern; @@ -1155,7 +1114,7 @@ struct server_context { size_t length = slot.generated_text.size(); // If there is a lazy grammar trigger word at stop_pos, enable the lazy grammar - if (match.is_grammar_trigger && gpt_sampler_trigger_grammar(model, slot.smpl, match.pattern)) { + if (match.is_grammar_trigger && common_sampler_trigger_grammar(model, slot.smpl, match.pattern)) { is_grammar_trigger = true; length = match.pos + match.matchLength; } else if (!match.is_grammar_trigger && match.pos != std::string::npos && !match.is_partial) { @@ -1199,13 +1158,63 @@ struct server_context { SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } + if (slot.has_new_line) { + // if we have already seen a new line, we stop after a certain time limit + if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { + slot.stopped_limit = true; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); + } + + // require that each 
new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent + if (slot.params.n_indent > 0) { + // check the current indentation + // TODO: improve by not doing it more than once for each new line + if (slot.last_nl_pos > 0) { + size_t pos = slot.last_nl_pos; + + int n_indent = 0; + while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { + n_indent++; + pos++; + } + + if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { + slot.stopped_limit = true; + slot.has_next_token = false; + + // cut the last line + slot.generated_text.erase(pos, std::string::npos); + + SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); + } + } + + // find the next new line + { + const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); + + if (pos != std::string::npos) { + slot.last_nl_pos = pos + 1; + } + } + } + } + + // check if there is a new line in the generated text + if (result.text_to_send.find('\n') != std::string::npos) { + slot.has_new_line = true; + } + // if context shift is disabled, we stop when it reaches the context limit - if (slot.n_decoded >= slot.n_ctx) { + if (slot.n_past >= slot.n_ctx) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; - SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx); + SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", + slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); } if (llama_token_is_eog(model, result.tok)) { @@ -1217,18 +1226,18 @@ struct server_context { const auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction SLT_WRN(slot, - "n_predict (%d) is not set and self-context extend is disabled. " + "n_predict (%d) is set for infinite generation. " "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", slot.params.n_predict, n_ctx_train); } - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); return slot.has_next_token; // continue } @@ -1237,7 +1246,7 @@ struct server_context { std::vector samplers; samplers.reserve(slot.sparams.samplers.size()); for (const auto & sampler : slot.sparams.samplers) { - samplers.emplace_back(gpt_sampler_type_to_str(sampler)); + samplers.emplace_back(common_sampler_type_to_str(sampler)); } return json { @@ -1245,13 +1254,15 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, - {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, + {"seed_cur", slot.smpl ? 
common_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, + {"xtc_probability", slot.sparams.xtc_probability}, + {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, @@ -1311,7 +1322,7 @@ struct server_context { }; if (slot.sparams.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); + const std::vector to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1348,6 +1359,7 @@ struct server_context { {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, {"prompt", slot.prompt}, + {"has_new_line", slot.has_new_line}, {"truncated", slot.truncated}, {"stopped_eos", slot.stopped_eos}, {"stopped_word", slot.stopped_word}, @@ -1361,7 +1373,7 @@ struct server_context { if (slot.sparams.n_probs > 0) { std::vector probs; if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + const std::vector stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( @@ -1409,12 +1421,13 @@ struct server_context { res.data = json { {"embedding", std::vector(n_embd, 0.0f)}, + {"index", slot.index}, }; continue; } - llama_embd_normalize(embd, embd_res.data(), n_embd); + common_embd_normalize(embd, embd_res.data(), n_embd); res.data = json { {"embedding", embd_res}, @@ -1427,6 +1440,44 @@ struct server_context { queue_results.send(res); } + void send_rerank(const server_slot & slot, const llama_batch & batch) { + server_task_result res; + res.id = slot.id_task; + res.error = false; + res.stop = true; + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) { + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + + res.data = json { + {"index", slot.index}, + {"score", -1e6}, + }; + + continue; + } + + res.data = json { + {"index", slot.index}, + {"score", embd[0]}, + }; + } + + SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str()); + + queue_results.send(res); + } + // // Functions to create new task(s) and receive result(s) // @@ -1458,22 +1509,34 @@ struct server_context { if (prompt.is_string() || json_is_array_of_numbers(prompt)) { data["index"] = 0; create_task(data, false, nullptr); - } - // otherwise, it's a multiple-prompt task, we break it into smaller tasks - else if (prompt.is_array()) { + } else if (prompt.is_array()) { + // otherwise, it's a multiple-prompt task, we break it into smaller tasks std::vector prompts = prompt; - for (size_t i = 0; i < prompts.size(); i++) { - const auto & e = prompts[i]; - if (e.is_string() || json_is_array_of_numbers(e)) { - data["index"] = i; - 
create_task(data, true, e); - } else { - throw std::runtime_error(error_msg); + if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + // prompts[0] is the question + // the rest are the answers/documents + SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1); + for (size_t i = 1; i < prompts.size(); i++) { + json qd; + qd.push_back(prompts[0]); + qd.push_back(prompts[i]); + data["index"] = i - 1; + create_task(data, true, qd); + } + } else { + SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size()); + for (size_t i = 0; i < prompts.size(); i++) { + const auto & e = prompts[i]; + if (e.is_string() || json_is_array_of_numbers(e)) { + data["index"] = i; + create_task(data, true, e); + } else { + throw std::runtime_error(error_msg); + } } } - } - // invalid case - else { + } else { + // invalid case throw std::runtime_error(error_msg); } @@ -1512,7 +1575,9 @@ struct server_context { return; } - size_t idx = result.data["index"]; + const size_t idx = result.data["index"]; + GGML_ASSERT(idx < results.size() && "index out of range"); + results[idx] = result; } result_handler(results); @@ -1581,16 +1646,6 @@ struct server_context { break; } - if (task.data.contains("system_prompt")) { - std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); - system_prompt_set(sys_prompt); - - for (server_slot & slot : slots) { - slot.n_past = 0; - slot.n_past_se = 0; - } - } - slot->reset(); slot->id_task = task.id; @@ -1631,6 +1686,7 @@ struct server_context { slot_data["prompt"] = slot.prompt; slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, + {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, {"stopped_eos", slot.stopped_eos}, @@ -1754,6 +1810,9 @@ struct server_context { } slot->cache_tokens.resize(token_count); + // TODO: maybe detokenize the slot->cache_tokens instead? + slot->prompt = string_format("[restored %d tokens from file]", (int) token_count); + const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -1804,7 +1863,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, loras); + common_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1816,10 +1875,6 @@ struct server_context { } void update_slots() { - if (system_need_update) { - system_prompt_update(); - } - // check if all slots are idle { bool all_idle = true; @@ -1833,7 +1888,7 @@ struct server_context { if (all_idle) { SRV_INF("%s", "all slots are idle\n"); - if (system_prompt.empty() && clean_kv_cache) { + if (clean_kv_cache) { kv_cache_clear(); } @@ -1854,43 +1909,41 @@ struct server_context { // apply context-shift if needed // TODO: simplify and improve for (server_slot & slot : slots) { - if (slot.ga_n == 1) { - if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - if (!params.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - slot.release(); - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - continue; - } - - // Shift context - const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; - const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2); - - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - - llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - - if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - } - - slot.n_past -= n_discard; - - slot.truncated = true; + if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { + if (!params.ctx_shift) { + // this check is redundant (for good) + // we should never get here, because generation should already stopped in process_token() + slot.release(); + send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); + continue; } + + // Shift context + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = slot.n_past - n_keep; + const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); + + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard); + + if (slot.params.cache_prompt) { + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + } + + slot.n_past -= n_discard; + + slot.truncated = true; } } // start populating the batch for this iteration - llama_batch_clear(batch); + common_batch_clear(batch); // frist, add sampled tokens from any ongoing sequences for (auto & slot : slots) { @@ -1900,11 +1953,7 @@ struct server_context { slot.i_batch = batch.n_tokens; - const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - // TODO: we always have to take into account the "system_tokens" - // this is not great and needs to be improved somehow - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true); slot.n_past += 1; @@ -1912,8 +1961,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1923,6 +1972,7 @@ struct server_context { // track if this is an embedding or non-embedding batch // if we've added sampled tokens above, we are in non-embedding mode // -1: none, 0: non-embedding, 1: embedding + // TODO: make enum int32_t batch_type = batch.n_tokens > 0 ? 
0 : -1; // next, batch any pending prompts without exceeding n_batch @@ -1939,40 +1989,126 @@ struct server_context { slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) { - const bool add_bos = llama_add_bos_token(model); - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } + switch (slot.cmpl_type) { + case SERVER_TASK_CMPL_TYPE_NORMAL: + case SERVER_TASK_CMPL_TYPE_EMBEDDING: + { + prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true); + } break; + case SERVER_TASK_CMPL_TYPE_RERANK: + { + // require slot.prompt to be array of 2 strings + if (!slot.prompt.is_array() || slot.prompt.size() != 2) { + SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); + slot.release(); + send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); + continue; + } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); + // prompt: [BOS]query[EOS][SEP]doc[EOS] + prompt_tokens.clear(); + prompt_tokens.push_back(llama_token_bos(model)); + { + const auto part = tokenize(slot.prompt[0], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + prompt_tokens.push_back(llama_token_sep(model)); + { + const auto part = tokenize(slot.prompt[1], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + } break; + case SERVER_TASK_CMPL_TYPE_INFILL: + { + // TODO: optimize this block by reducing memory allocations and movement - const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } + // use FIM repo-level pattern: + // ref: https://arxiv.org/pdf/2409.12186 + // + // [FIM_REP]myproject + // [FIM_SEP]filename0 + // extra chunk 0 + // [FIM_SEP]filename1 + // extra chunk 1 + // ... + // [FIM_SEP]filename + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt + // + auto tokens_prefix = tokenize(slot.input_prefix, false, false); + auto tokens_suffix = tokenize(slot.input_suffix, false, false); + auto tokens_prompt = tokenize(slot.prompt, false, false); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + slot.extra_tokens.clear(); + if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + static const auto k_fim_repo = tokenize("myproject\n", false, false); - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? 
prefix_tokens : suffix_tokens; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + slot.extra_tokens.push_back(llama_token_fim_rep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + } - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + const std::string text = chunk.value("text", ""); + const std::string filename = chunk.value("filename", "tmp"); - prompt_tokens = embd_inp; - } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = tokenize(filename + "\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } else { + // chunk separator in binary form to avoid confusing the AI + static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; + static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + } + + const auto chunk_tokens = tokenize(text, false, false); + slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); + } + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + // TODO: current filename + static const auto k_fim_file = tokenize("filename\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } + + // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) + const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); + const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); + + // fill the rest of the context with extra chunks + const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); + + tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); + tokens_suffix.resize(n_suffix_take); + + tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); + tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); + + auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix; + auto embd_end = params.spm_infill ? 
tokens_prefix : tokens_suffix; + + if (llama_add_bos_token(model)) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + + SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size()); + + // put the extra context before the FIM prefix + embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end()); + + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + embd_inp.push_back(llama_token_fim_mid(model)); + + prompt_tokens = std::move(embd_inp); + } break; } slot.n_past = 0; @@ -1980,6 +2116,19 @@ struct server_context { SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); + // print prompt tokens (for debugging) + if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) prompt_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } + // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); @@ -1990,7 +2139,7 @@ struct server_context { continue; } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) { + if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_ubatch) { slot.release(); @@ -2000,7 +2149,9 @@ struct server_context { } else { if (!params.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size - if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) { + // TODO: there should be a separate parameter that control prompt truncation + // context shift should be applied only during the generation phase + if (slot.n_prompt_tokens >= slot.n_ctx) { slot.release(); send_error(slot, "the request exceeds the available context size. 
try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); continue; @@ -2011,8 +2162,8 @@ struct server_context { } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it (if group attention self-extend is disabled) - if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { + // if input prompt is too big, truncate it + if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; @@ -2037,20 +2188,62 @@ struct server_context { GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - gpt_sampler_reset(slot.smpl); - - if (!slot.params.cache_prompt) { - slot.n_past_se = 0; - slot.ga_i = 0; - } else { - GGML_ASSERT(slot.ga_n == 1); + common_sampler_reset(slot.smpl); + if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { - gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false); + common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); + } + + // reuse chunks from the cached prompt by shifting their KV cache in the new position + if (params.n_cache_reuse > 0) { + size_t head_c = slot.n_past; // cache + size_t head_p = slot.n_past; // current prompt + + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + + while (head_c < slot.cache_tokens.size() && + head_p < prompt_tokens.size()) { + + size_t n_match = 0; + while (head_c + n_match < slot.cache_tokens.size() && + head_p + n_match < prompt_tokens.size() && + slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { + + n_match++; + } + + if (n_match >= (size_t) params.n_cache_reuse) { + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + //} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; + + common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false); + + slot.n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); } } } @@ -2060,15 +2253,13 @@ struct server_context { SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; - if (slot.ga_i > 0) { - slot.n_past_se--; - } } slot.n_prompt_tokens_processed = 0; } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) { + // non-causal tasks require to fit the entire prompt in the physical batch + if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > 
n_batch) { continue; @@ -2076,7 +2267,10 @@ struct server_context { } // check that we are in the right batch_type, if not defer the slot - bool slot_type = slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ? 1 : 0; + const bool slot_type = + slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || + slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0; + if (batch_type == -1) { batch_type = slot_type; } else if (batch_type != slot_type) { @@ -2084,55 +2278,31 @@ struct server_context { } // keep only the common part - int p0 = (int) system_tokens.size() + slot.n_past; - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); - p0 = (int) system_tokens.size(); - if (p0 != 0) { - // copy over the system prompt when there is one - llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); - } - - // there is no common part left (except for the system prompt) + // there is no common part left slot.n_past = 0; - slot.n_past_se = 0; - slot.ga_i = 0; - // TODO: is the system prompt ever in the sampling context? - gpt_sampler_reset(slot.smpl); + + common_sampler_reset(slot.smpl); } + SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); + // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - SLT_INF(slot, "kv cache rm [%d, end)\n", p0); - - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - int32_t ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - // add prompt tokens for processing in the current batch - // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow - for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { - if (slot.ga_n != 1) { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - } - - llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); } slot.n_prompt_tokens_processed++; - slot_npast++; + slot.n_past++; } SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); @@ -2173,34 +2343,6 @@ struct server_context { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - for (auto & slot : slots) { - if (slot.ga_n != 1) { - // context extension via Self-Extend - // TODO: simplify and/or abstract this - while (slot.n_past_se >= slot.ga_i + slot.ga_w) { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - SLT_DBG(slot, "shift: [%6d, %6d] + %6d 
-> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - - slot.n_past_se += n_tokens; - } - } - llama_batch batch_view = { n_tokens, batch.token + i, @@ -2209,7 +2351,6 @@ struct server_context { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, - 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); @@ -2249,6 +2390,13 @@ struct server_context { continue; // continue loop of slots } + if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + send_rerank(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; // continue loop of slots + } + // prompt evaluated for next-token prediction slot.state = SLOT_STATE_GENERATING; } else if (slot.state != SLOT_STATE_GENERATING) { @@ -2256,9 +2404,9 @@ struct server_context { } completion_token_output result; - const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); - gpt_sampler_accept(slot.smpl, id, true); + common_sampler_accept(slot.smpl, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) { @@ -2269,7 +2417,7 @@ struct server_context { result.tok = id; - const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); + const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { result.probs.push_back({ @@ -2333,13 +2481,13 @@ inline void signal_handler(int signal) { int main(int argc, char ** argv) { // own arguments required by this example - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } - gpt_init(); + common_init(); // enabling this will output extra debug information in the HTTP responses from the server // see format_final_response_oaicompat() @@ -2348,10 +2496,6 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - if (!params.system_prompt.empty()) { - ctx_server.system_prompt_set(params.system_prompt); - } - if (params.model_alias == "unknown") { params.model_alias = params.model; } @@ -2361,7 +2505,7 @@ int main(int argc, char ** argv) { LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); std::unique_ptr svr; @@ -2455,20 +2599,10 @@ int main(int argc, char ** argv) { // auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - // TODO: should we apply API key to all endpoints, including "/health" and "/models"? 
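// Sketch (not from the patch): the API-key middleware below inverts its check, keeping a small
// list of public endpoints instead of enumerating protected ones. The resulting decision,
// assuming the existing "Authorization: Bearer <key>" comparison further down is unchanged:
#include <string>
#include <unordered_set>
#include <vector>

static bool path_requires_api_key(const std::string & path, const std::vector<std::string> & api_keys) {
    static const std::unordered_set<std::string> public_endpoints = { "/health", "/models", "/v1/models" };
    if (api_keys.empty())                 return false; // no --api-key configured: everything stays open
    if (public_endpoints.count(path) > 0) return false; // health checks and model listing remain public
    return true;                                        // every other endpoint must present a valid key
}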
- static const std::unordered_set protected_endpoints = { - "/props", - "/completion", - "/completions", - "/v1/completions", - "/chat/completions", - "/v1/chat/completions", - "/infill", - "/tokenize", - "/detokenize", - "/embedding", - "/embeddings", - "/v1/embeddings", + static const std::unordered_set public_endpoints = { + "/health", + "/models", + "/v1/models", }; // If API key is not set, skip validation @@ -2476,8 +2610,8 @@ int main(int argc, char ** argv) { return true; } - // If path is not in protected_endpoints list, skip validation - if (protected_endpoints.find(req.path) == protected_endpoints.end()) { + // If path is public, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end()) { return true; } @@ -2539,7 +2673,7 @@ int main(int argc, char ** argv) { const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2788,29 +2922,33 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { - std::string template_key = "tokenizer.chat_template", curr_tmpl; - int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); - if (tlen > 0) { - std::vector curr_tmpl_buf(tlen + 1, 0); - if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { - curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); - } - } json data = { - { "system_prompt", ctx_server.system_prompt.c_str() }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params.n_parallel }, - { "bos_token", llama_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), true) }, - { "eos_token", llama_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), true) }, - { "chat_template", curr_tmpl.c_str() }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), true) }, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), true) }, + { "chat_template", llama_get_chat_template(ctx_server.model) }, }; res_ok(res, data); }; + const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + if (!ctx_server.params.endpoint_props) { + res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + + json data = json::parse(req.body); + + // update any props here + + res_ok(res, {{ "success", true }}); + }; + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { - if (ctx_server.params.embedding) { - res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + if (ctx_server.params.embedding || ctx_server.params.reranking) { + res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2863,15 +3001,31 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); }; - const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + std::string err; + if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + + if (!err.empty()) { + res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return; + } + json data = json::parse(req.body); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; // TODO: maybe merge this function with "handle_completions_generic" const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { - res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + if (ctx_server.params.embedding || ctx_server.params.reranking) { + res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2962,11 +3116,12 @@ int main(int argc, char ** argv) { if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special); + + std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { - std::string piece = llama_token_to_piece(ctx_server.ctx, token); + std::string piece = common_token_to_piece(ctx_server.ctx, token); json piece_json; // Check if the piece is valid UTF-8 @@ -3008,6 +3163,11 @@ int main(int argc, char ** argv) { }; const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + // TODO: somehow clean up these checks in the future + if (!ctx_server.params.embedding || ctx_server.params.reranking) { + res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } const json body = json::parse(req.body); bool is_openai = false; @@ -3058,6 +3218,79 @@ int main(int argc, char ** argv) { res_ok(res, root); }; + const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + if (!ctx_server.params.reranking) { + res_error(res, format_error_response("This server does not support reranking.
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + const json body = json::parse(req.body); + + // TODO: implement + //int top_n = 1; + //if (body.count("top_n") != 1) { + // top_n = body.at("top_n"); + //} else { + // res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + // return; + //} + + json query; + if (body.count("query") == 1) { + query = body.at("query"); + if (!query.is_string()) { + res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } else { + res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return; + } + + std::vector documents = json_value(body, "documents", std::vector()); + if (documents.empty()) { + res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); + return; + } + + // construct prompt object: array of ["query", "doc0", "doc1", ...] + json prompt; + prompt.push_back(query); + for (const auto & doc : documents) { + prompt.push_back(doc); + } + + LOG_DBG("rerank prompt: %s\n", prompt.dump().c_str()); + + // create and queue the task + json responses = json::array(); + bool error = false; + { + std::vector tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK); + ctx_server.queue_results.add_waiting_tasks(tasks); + ctx_server.queue_tasks.post(tasks); + + // get the result + std::unordered_set task_ids = server_task::get_list_id(tasks); + + ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + for (const auto & res : results) { + responses.push_back(res.data); + } + }, [&](const json & error_data) { + res_error(res, error_data); + error = true; + }); + } + + if (error) { + return; + } + + // write JSON response + json root = format_response_rerank(body, responses); + res_ok(res, root); + }; + const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); for (size_t i = 0; i < ctx_server.loras.size(); ++i) { @@ -3121,30 +3354,39 @@ int main(int argc, char ** argv) { svr->set_base_dir(params.public_path); } - // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + if (!params.api_keys.empty()) { + // for now, if API key is set, web UI is unusable + svr->Get("/", [&](const httplib::Request &, httplib::Response & res) { + return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8"); + }); + } else { + // using embedded static files + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - // add new-ui files - svr->Get("/colorthemes.css", 
handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + } // register API routes - svr->Get ("/health", handle_health); + svr->Get ("/health", handle_health); // public endpoint (no API key check) svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); + svr->Post("/props", handle_props_change); + svr->Get ("/models", handle_models); // public endpoint (no API key check) + svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); @@ -3154,6 +3396,10 @@ int main(int argc, char ** argv) { svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); svr->Post("/v1/embeddings", 
handle_embeddings); + svr->Post("/rerank", handle_rerank); + svr->Post("/reranking", handle_rerank); + svr->Post("/v1/rerank", handle_rerank); + svr->Post("/v1/reranking", handle_rerank); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); // LoRA adapters hotswap @@ -3218,10 +3464,11 @@ int main(int argc, char ** argv) { } // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature index ba3afcf06..ae6c6b01b 100644 --- a/examples/server/tests/features/ctx_shift.feature +++ b/examples/server/tests/features/ctx_shift.feature @@ -13,6 +13,10 @@ Feature: llama.cpp server And 32 as batch size And 2 slots + # the prompt is 301 tokens + # the slot context is 256/2 = 128 tokens + # the prompt is truncated to keep the last 109 tokens + # 64 tokens are generated thanks to shifting the context when it gets full Scenario: Inference with context shift And 64 server max tokens to predict Then the server is starting diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 818ea3beb..f4fe2ee43 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -15,7 +15,7 @@ Feature: llama.cpp server And 128 as batch size And 128 as ubatch size And 512 KV cache size - And embeddings extraction + And enable embeddings endpoint Then the server is starting Then the server is healthy diff --git a/examples/server/tests/features/rerank.feature b/examples/server/tests/features/rerank.feature new file mode 100644 index 000000000..c36cc8e21 --- /dev/null +++ b/examples/server/tests/features/rerank.feature @@ -0,0 +1,42 @@ +@llama.cpp +@rerank +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf + And a model file jina-reranker-v1-tiny-en.gguf + And a model alias jina-reranker-v1-tiny-en + And 42 as server seed + And 2 slots + And 512 as batch size + And 512 as ubatch size + And 512 KV cache size + And enable reranking endpoint + Then the server is starting + Then the server is healthy + + Scenario: Rerank + Given a rerank query: + """ + Machine learning is + """ + And a rerank document: + """ + A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines. + """ + And a rerank document: + """ + Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. 
The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants. + """ + And a rerank document: + """ + Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. + """ + And a rerank document: + """ + Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine. + """ + When reranking request + Then reranking results are returned + Then reranking highest score is index 2 and lowest score is index 3 diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index eb82e7aca..0a3c5cc77 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -5,7 +5,7 @@ Feature: Security Background: Server startup with an api key defined Given a server listening on localhost:8080 And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - And a server api key llama.cpp + And a server api key THIS_IS_THE_KEY Then the server is starting Then the server is healthy @@ -16,11 +16,11 @@ Feature: Security And a completion request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackeme | raised | - | | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackeme | raised | + | | raised | Scenario Outline: OAI Compatibility Given a system prompt test @@ -32,10 +32,10 @@ Feature: Security Given an OAI compatible chat completions request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackme | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackme | raised | Scenario Outline: OAI Compatibility (invalid response formats) Given a system prompt test @@ -55,7 +55,7 @@ Feature: Security Scenario Outline: CORS Options - Given a user api key llama.cpp + Given a user api key THIS_IS_THE_KEY When an OPTIONS request is sent from Then CORS header is set to diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index f1a97deec..aa70c46d3 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -68,6 +68,7 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.server_api_key = None context.server_continuous_batching = False context.server_embeddings = False + context.server_reranking = False context.server_metrics = False context.server_process = None context.seed = None @@ -87,6 +88,10 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.concurrent_tasks = [] context.prompts = [] + context.reranking_query = None + context.reranking_documents = [] + context.reranking_results = None + @step('a model file {hf_file} from HF repo {hf_repo}') def step_download_hf_model(context, hf_file: str, hf_repo: str): @@ -186,10 +191,13 @@ def step_server_continuous_batching(context): context.server_continuous_batching = True -@step('embeddings extraction') +@step('enable embeddings endpoint') def step_server_embeddings(context): 
context.server_embeddings = True +@step('enable reranking endpoint') +def step_server_reranking(context): + context.server_reranking = True @step('prometheus compatible metrics exposed') def step_server_metrics(context): @@ -473,6 +481,14 @@ def step_impl(context, n_ga_w): def step_prompt_passkey(context): context.prompt_passkey = context_text(context) +@step('a rerank query') +def step_set_rerank_query(context): + context.reranking_query = context_text(context) + context.reranking_documents = [] + +@step('a rerank document') +def step_set_rerank_document(context): + context.reranking_documents.append(context_text(context)) @step('{n_prompts:d} fixed prompts') def step_fixed_prompts(context, n_prompts): @@ -692,6 +708,22 @@ async def step_compute_embedding(context): context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url) +@step('reranking request') +@async_run_until_complete +async def step_compute_reranking(context): + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: + async with session.post(f'{context.base_url}/reranking', + json={ + "query": context.reranking_query, + "documents": context.reranking_documents, + }) as response: + if response.status == 200: + response_json = await response.json() + context.reranking_results = response_json['results'] + else: + context.reranking_results = response.status + + @step('all embeddings are the same') @async_run_until_complete async def step_all_embeddings_are_the_same(context): @@ -777,6 +809,24 @@ async def all_embeddings_are_generated(context): for i in range(n_embedding_requests): assert_embeddings(context.tasks_result.pop().pop()) +@step('reranking results are returned') +def reranking_results_are_returned(context): + assert len(context.reranking_results) == len(context.reranking_documents) + +@step('reranking highest score is index {idx_high:d} and lowest score is index {idx_low:d}') +def reranking_results_ordering(context, idx_high: int, idx_low: int): + max_score, max_idx = float('-inf'), 0 + min_score, min_idx = float('inf'), 0 + for res in context.reranking_results: + if max_score < res['relevance_score']: + max_score = res['relevance_score'] + max_idx = res['index'] + if min_score > res['relevance_score']: + min_score = res['relevance_score'] + min_idx = res['index'] + print(context.reranking_results) + assert max_idx == idx_high + assert min_idx == idx_low @step('adding special tokens') def step_tokenize_set_add_special(context): @@ -1338,7 +1388,8 @@ async def wait_for_slots_status(context, async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: while True: - async with await session.get(f'{base_url}/slots', params=params) as slots_response: + headers = {'Authorization': f'Bearer {context.server_api_key}'} + async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response: status_code = slots_response.status slots = await slots_response.json() if context.debug: @@ -1426,6 +1477,7 @@ def start_server_background(context): context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_listen_addr = context.server_fqdn server_args = [ + '--slots', # required to get slot status via the /slots endpoint '--host', server_listen_addr, '--port', context.server_port, ] @@ -1451,6 +1503,8 @@ def start_server_background(context): server_args.append('--cont-batching') if context.server_embeddings: server_args.append('--embedding') + if context.server_reranking: + server_args.append('--reranking') if context.server_metrics:
server_args.append('--metrics') if context.model_alias: diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index f2d7e5c57..553954872 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 behave~=1.2.6 -huggingface_hub~=0.20.3 +huggingface_hub~=0.23.2 numpy~=1.26.4 openai~=1.30.3 prometheus-client~=0.20.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index fc66fb591..83d3de2da 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -59,7 +59,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - std::vector chat; + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; @@ -87,12 +87,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); return formatted_chat; } +static std::string llama_get_chat_template(const struct llama_model * model) { + std::string template_key = "tokenizer.chat_template"; + // call with NULL buffer to get the total size of the string + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); + if (res < 0) { + return ""; + } else { + std::vector model_template(res, 0); + llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size()); + } +} + // // base64 utils (TODO: move to common in the future) // @@ -185,14 +198,14 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t common_part(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t common_part(const std::string & a, const std::string & b) { +static size_t longest_common_prefix(const std::string & a, const std::string & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} @@ -231,7 +244,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; @@ -239,7 +252,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + std::string out = token == -1 ? 
"" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) @@ -406,9 +419,9 @@ static json oaicompat_completion_params_parse( // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (body.contains("logprobs")) { + if (json_value(body, "logprobs", false)) { llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs")) { + } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) { throw std::runtime_error("top_logprobs requires logprobs to be set to true"); } @@ -627,7 +640,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, - {"usage", json { + {"usage", json { // TODO: fill {"prompt_tokens", 0}, {"total_tokens", 0} }}, @@ -637,6 +650,29 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } +static json format_response_rerank(const json & request, const json & ranks) { + json data = json::array(); + int i = 0; + for (const auto & rank : ranks) { + data.push_back(json{ + {"index", i++}, + {"relevance_score", json_value(rank, "score", 0.0)}, + }); + } + + json res = json { + {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", "list"}, + {"usage", json { // TODO: fill + {"prompt_tokens", 0}, + {"total_tokens", 0} + }}, + {"results", data} + }; + + return res; +} + static bool is_valid_utf8(const std::string & str) { const unsigned char* bytes = reinterpret_cast(str.data()); const unsigned char* end = bytes + str.length(); diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 070cfbe7a..b63afbb8b 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index c2b7267c8..59760fe95 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,50 +1,112 @@ -#include "arg.h" -#include "common.h" -#include "log.h" #include "llama.h" - +#include +#include +#include #include static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG("\n"); + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]); + printf("\n"); } int main(int argc, char ** argv) { - gpt_params params; + // path to the model gguf file + std::string model_path; + // prompt to generate text from + std::string prompt = "Hello my name is"; + // number of layers to offload to the GPU + int ngl = 99; + // number of tokens to predict + int n_predict = 32; - params.prompt = "Hello my name is"; - params.n_predict = 32; + // parse command line arguments - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { - return 1; + { + int i = 1; + for (; i < argc; i++) { + if (strcmp(argv[i], "-m") == 0) { 
+ if (i + 1 < argc) { + model_path = argv[++i]; + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-n") == 0) { + if (i + 1 < argc) { + try { + n_predict = std::stoi(argv[++i]); + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-ngl") == 0) { + if (i + 1 < argc) { + try { + ngl = std::stoi(argv[++i]); + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else { + // prompt starts here + break; + } + } + if (model_path.empty()) { + print_usage(argc, argv); + return 1; + } + if (i < argc) { + prompt = argv[i++]; + for (; i < argc; i++) { + prompt += " "; + prompt += argv[i]; + } + } } - gpt_init(); - - // total length of the sequence including the prompt - const int n_predict = params.n_predict; - - // init LLM - - llama_backend_init(); - llama_numa_init(params.numa); - // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } + // tokenize the prompt + + // find the number of tokens in the prompt + const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); + + // allocate space for the tokens and tokenize the prompt + std::vector prompt_tokens(n_prompt); + if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { + fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__); + return 1; + } + // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = llama_context_default_params(); + // n_ctx is the context size + ctx_params.n_ctx = n_prompt + n_predict - 1; + // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode + ctx_params.n_batch = n_prompt; + // enable performance counters + ctx_params.no_perf = false; llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -53,117 +115,87 @@ int main(int argc, char ** argv) { return 1; } + // initialize the sampler + auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = false; - llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); - // tokenize the prompt - - std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); - - const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - - LOG("\n"); - LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); - - // make sure the KV cache is big enough to hold all the prompt and generated tokens - if (n_kv_req > n_ctx) { - LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__); - return 1; - } - // print the prompt token-by-token - LOG("\n"); - - for (auto id : tokens_list) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + for (auto 
id : prompt_tokens) { + char buf[128]; + int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true); + if (n < 0) { + fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); + return 1; + } + std::string s(buf, n); + printf("%s", s.c_str()); } - // create a llama_batch with size 512 - // we use this object to submit token data for decoding + // prepare a batch for the prompt - llama_batch batch = llama_batch_init(512, 0, 1); - - // evaluate the initial prompt - for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); - } - - // llama_decode will output logits only for the last token of the prompt - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch) != 0) { - LOG("%s: llama_decode() failed\n", __func__); - return 1; - } + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); // main loop - int n_cur = batch.n_tokens; - int n_decode = 0; - const auto t_main_start = ggml_time_us(); + int n_decode = 0; + llama_token new_token_id; + + for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) { + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + + n_pos += batch.n_tokens; - while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); + new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG("\n"); - + if (llama_token_is_eog(model, new_token_id)) { break; } - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + char buf[128]; + int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); + return 1; + } + std::string s(buf, n); + printf("%s", s.c_str()); fflush(stdout); - // prepare the next batch - llama_batch_clear(batch); - - // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1); n_decode += 1; } - - n_cur += 1; - - // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); - return 1; - } } - LOG("\n"); + printf("\n"); const auto t_main_end = ggml_time_us(); - LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG("\n"); + fprintf(stderr, "\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); + fprintf(stderr, "\n"); - LOG("\n"); - - llama_batch_free(batch); llama_sampler_free(smpl); llama_free(ctx); llama_free_model(model); - llama_backend_free(); - return 0; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index adf6255e1..a40e755a2 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -26,20 +26,25 @@ struct seq_draft { std::vector tokens; std::vector> dists; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; int main(int argc, char ** argv) { - gpt_params params; 
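With the rewritten example above, a typical invocation would look something like the following (illustrative; assumes a local GGUF model and mirrors the defaults shown in print_usage: 32 tokens predicted, 99 GPU layers):

  ./llama-simple -m model.gguf -n 32 -ngl 99 "Hello my name is"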
+ common_params params; // needed to get candidate probs even for temp <= 0.0 params.sparams.n_probs = 128; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } - gpt_init(); + if (params.n_predict < -1) { + LOG_ERR("%s: --n-predict must be >= -1\n", __func__); + return 1; + } + + common_init(); if (params.model_draft.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); @@ -66,7 +71,7 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - llama_init_result llama_init_tgt = llama_init_from_gpt_params(params); + common_init_result llama_init_tgt = common_init_from_params(params); model_tgt = llama_init_tgt.model; ctx_tgt = llama_init_tgt.context; @@ -78,7 +83,7 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; - llama_init_result llama_init_dft = llama_init_from_gpt_params(params); + common_init_result llama_init_dft = common_init_from_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; @@ -124,8 +129,8 @@ int main(int argc, char ** argv) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) { LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); return 1; } } @@ -134,7 +139,7 @@ int main(int argc, char ** argv) { // Tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); + inp = common_tokenize(ctx_tgt, params.prompt, true, true); const int max_context_size = llama_n_ctx(ctx_tgt); const int max_tokens_list_size = max_context_size - 4; @@ -147,7 +152,7 @@ int main(int argc, char ** argv) { LOG("\n\n"); for (auto id : inp) { - LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); + LOG("%s", common_token_to_piece(ctx_tgt, id).c_str()); } const int n_input = inp.size(); @@ -155,9 +160,9 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); + llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1)); + llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1)); + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input)); const auto t_enc_end = ggml_time_us(); @@ -178,20 +183,18 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context (reuse the llama_context's sampling instance) - struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); - - struct llama_sampler * softmax = llama_sampler_init_softmax(); + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); // draft sequence data std::vector drafts(n_seq_dft); for (int s = 0; s < n_seq_dft; ++s) { - // allocate gpt_sampler for each draft sequence - drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); + // allocate llama_sampler for each draft sequence + drafts[s].smpl = common_sampler_init(model_dft, params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 
1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); + llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft); const auto t_dec_start = ggml_time_us(); @@ -229,9 +232,9 @@ int main(int argc, char ** argv) { bool accept = false; if (params.sparams.temp > 0) { // stochastic verification - gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); + common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); - auto & dist_tgt = *gpt_sampler_get_candidates(smpl); + auto & dist_tgt = *common_sampler_get_candidates(smpl); float p_tgt = 0.0f; float p_dft = 0.0f; @@ -277,13 +280,13 @@ int main(int argc, char ** argv) { s_keep = s; accept = true; token_id = drafts[s].tokens[i_dft]; - token_str = llama_token_to_piece(ctx_tgt, token_id); - gpt_sampler_accept(smpl, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(smpl, token_id, true); LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -349,19 +352,19 @@ int main(int argc, char ** argv) { const int idx = dist(rng); token_id = dist_tgt.data[idx].id; - gpt_sampler_accept(smpl, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(smpl, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); } } else { // greedy verification // sample from the target model LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - gpt_sampler_accept(smpl, token_id, true); + common_sampler_accept(smpl, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); + token_str = common_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -431,8 +434,8 @@ int main(int argc, char ** argv) { drafts[0].dists.push_back(std::vector()); drafts[0].i_batch_tgt.push_back(0); - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); + common_batch_clear(batch_dft); + common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); @@ -441,14 +444,14 @@ int main(int argc, char ** argv) { ++n_past_dft; } - if (n_predict > params.n_predict || has_eos) { + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { break; } if (drafts[0].smpl) { - gpt_sampler_free(drafts[0].smpl); + common_sampler_free(drafts[0].smpl); } - drafts[0].smpl = gpt_sampler_clone(smpl); + drafts[0].smpl = common_sampler_clone(smpl); int n_seq_cur = 1; int n_past_cur = n_past_dft; @@ -461,8 +464,8 @@ int main(int argc, char ** argv) { drafts[0].drafting = true; drafts[0].i_batch_dft = 0; - llama_batch_clear(batch_tgt); - 
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -477,13 +480,13 @@ int main(int argc, char ** argv) { continue; } - gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); + common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); - const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); + const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } std::vector sa(1, s); @@ -518,9 +521,9 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; if (drafts[n_seq_cur].smpl) { - gpt_sampler_free(drafts[n_seq_cur].smpl); + common_sampler_free(drafts[n_seq_cur].smpl); } - drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); + drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl); sa.push_back(n_seq_cur); @@ -536,7 +539,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - gpt_sampler_accept(drafts[s].smpl, id, true); + common_sampler_accept(drafts[s].smpl, id, true); drafts[s].tokens.push_back(id); // save cur_p.data into drafts[s].dists @@ -545,12 +548,12 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + common_batch_add(batch_dft, id, n_past_cur, { s }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -617,14 +620,13 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("target:\n\n"); - gpt_perf_print(ctx_tgt, smpl); + common_perf_print(ctx_tgt, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); for (int s = 0; s < n_seq_dft; ++s) { - gpt_sampler_free(drafts[s].smpl); + common_sampler_free(drafts[s].smpl); } - llama_sampler_free(softmax); llama_batch_free(batch_dft); llama_free(ctx_tgt); diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index a9af6471f..12ad54256 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) { const bool parse_special = !no_parse_special; std::vector tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); + tokens = common_tokenize(model, prompt, add_bos, parse_special); if (printing_ids) { printf("["); @@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) { } else { bool invalid_utf8 = false; printf("%6d -> '", tokens[i]); - write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); + write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); if (invalid_utf8) { printf("' (utf-8 decode failure)\n"); } else { diff --git 
a/flake.lock b/flake.lock index 6333a09f0..702527028 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1726153070, - "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=", + "lastModified": 1727826117, + "narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a", + "rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1726755586, - "narHash": "sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK+Wk=", + "lastModified": 1728492678, + "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "c04d5652cfa9742b1d519688f65d1bbccea9eb7e", + "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1725233747, - "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=", + "lastModified": 1727825735, + "narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz" } }, "root": { diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 89fdf9d1c..cfa6e3f70 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -99,6 +99,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) +option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) +option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) +option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) if (NOT MSVC) option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 @@ -158,6 +161,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_RPC "ggml: use RPC" OFF) +option(GGML_AMX "ggml: use AMX" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 0dff47d65..23600eea9 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st // Graph allocator /* Example usage: - ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); // optional: create a worst-case graph and reserve the buffers to avoid reallocations ggml_gallocr_reserve(galloc, build_graph(max_batch)); diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h new file mode 100644 index 000000000..22b3f70f4 --- /dev/null +++ b/ggml/include/ggml-amx.h @@ -0,0 +1,25 @@ +#pragma once + 
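The AMX options added to ggml/CMakeLists.txt above are off by default; a build that opts into the new backend (assuming a CPU and toolchain with AMX support) might be configured along these lines, with GGML_AMX_TILE/GGML_AMX_INT8/GGML_AMX_BF16 available to force specific instruction subsets:

  cmake -B build -DGGML_AMX=ON
  cmake --build build --config Release -j$(nproc)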
+#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// buffer_type API +GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); + +GGML_API bool ggml_backend_is_amx(ggml_backend_t backend); + +// backend API +GGML_API ggml_backend_t ggml_backend_amx_init(void); + +GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); + +GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 71c0bef8e..5933b8e8f 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -12,43 +12,52 @@ extern "C" { typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend * ggml_backend_t; typedef void * ggml_backend_graph_plan_t; + typedef struct ggml_backend_reg * ggml_backend_reg_t; + typedef struct ggml_backend_device * ggml_backend_dev_t; + + + // + // Backend buffer type + // + + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); // // Backend buffer // - // buffer type - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); - GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); - - // buffer enum ggml_backend_buffer_usage { GGML_BACKEND_BUFFER_USAGE_ANY = 0, GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, }; - GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); - GGML_API 
ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); // - // Backend + // Backend (stream) // GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); @@ -64,9 +73,9 @@ extern "C" { GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // "offset" refers to the offset of the tensor data for setting/getting data - GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -76,65 +85,121 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); - // tensor copy between different backends - GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct 
ggml_tensor * dst); - // asynchronous copy // the copy is performed after all the currently queued operations in backend_src // backend_dst will wait for the copy to complete before performing other operations // automatic fallback to sync copy if async is not supported GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); - // events - GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend); - GGML_API void ggml_backend_event_free (ggml_backend_event_t event); - GGML_API void ggml_backend_event_record (ggml_backend_event_t event); - GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); + GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); // - // CPU backend + // Events // - GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); + GGML_API void ggml_backend_event_free(ggml_backend_event_t event); + GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); + GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); + GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); - GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); - GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + // + // Backend device + // - // Create a backend buffer from an existing pointer - GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + enum ggml_backend_dev_type { + GGML_BACKEND_DEVICE_TYPE_CPU, + GGML_BACKEND_DEVICE_TYPE_GPU, + // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication) + GGML_BACKEND_DEVICE_TYPE_CPU_FULL, + GGML_BACKEND_DEVICE_TYPE_GPU_FULL + }; - GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + // functionality supported by the device + struct ggml_backend_dev_caps { + // asynchronous operations + bool async; + // pinned host buffer + bool host_buffer; + // creating buffers from host ptr + bool buffer_from_host_ptr; + // event synchronization + bool events; + }; -#ifdef GGML_USE_CPU_HBM - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); -#endif + // all the device properties + struct ggml_backend_dev_props { + const char * name; + const char * description; + size_t memory_free; + size_t memory_total; + enum ggml_backend_dev_type type; + struct ggml_backend_dev_caps caps; + }; + + GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); + GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); + GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); + GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); + 
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); + + GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + + // + // Backend (reg) + // + + GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); + GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); + GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); + GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); + + + // Functions that may be obtained using ggml_backend_reg_get_proc_address + typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *); + typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int); // // Backend registry // - // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way + // Backend (reg) enumeration + GGML_API size_t ggml_backend_reg_count(void); + GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); + GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); - GGML_API size_t ggml_backend_reg_get_count(void); - GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) - GGML_API const char * ggml_backend_reg_get_name(size_t i); - GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific - GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); - GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size); + // Device enumeration + GGML_API size_t ggml_backend_dev_count(void); + GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); + + // Direct backend (stream) initialization + // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) + GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); + // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) + GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); + // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) + GGML_API ggml_backend_t ggml_backend_init_best(void); // // Backend scheduler // - // The backend scheduler allows for multiple backends to be used together + // The backend scheduler allows for multiple backend devices to be used together // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between 
backends // The backends are selected based on: // - the backend that supports the operation @@ -169,9 +234,9 @@ extern "C" { } */ - struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback) // when ask == true, the scheduler wants to know if the user wants to observe this node // this allows the scheduler to batch nodes together in order to evaluate them in a single call // @@ -185,7 +250,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); @@ -200,7 +265,7 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler - GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); @@ -226,7 +291,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); @@ -235,6 +300,26 @@ extern "C" { GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); + // + // CPU backend + // + + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + + // Create a backend buffer from an existing pointer + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + + GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + +#ifdef GGML_USE_CPU_HBM + GGML_API 
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif #ifdef __cplusplus } diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h index f2e37de06..25b2e637f 100644 --- a/ggml/include/ggml-blas.h +++ b/ggml/include/ggml-blas.h @@ -9,13 +9,15 @@ extern "C" { #endif // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); +GGML_API ggml_backend_t ggml_backend_blas_init(void); -GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); +GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); // number of threads used for conversion to float // for openblas and blis, this will also set the number of threads used for blas operations -GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); +GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + +GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void); #ifdef __cplusplus diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index 031ad1ce2..528975493 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -34,6 +34,8 @@ extern "C" { */ #define GGML_CANN_MAX_DEVICES 16 +GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void); + /** * @brief Initializes the CANN backend for a specified device. * @@ -44,7 +46,7 @@ extern "C" { * @param device The index of the device to initialize. * @return A pointer to the initialized backend instance, or nullptr on failure. */ -GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); +GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device); /** * @brief Checks if a given backend is a CANN backend. @@ -55,7 +57,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); * @param backend The backend instance to check. * @return True if the backend is a CANN backend, false otherwise. */ -GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); +GGML_API bool ggml_backend_is_cann(ggml_backend_t backend); /** * @brief Retrieves the CANN buffer type for a specified device. @@ -67,7 +69,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. */ -GGML_API GGML_CALL ggml_backend_buffer_type_t +GGML_API ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device); /** @@ -78,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device); * * @return The number of CANN devices available. */ -GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +GGML_API int32_t ggml_backend_cann_get_device_count(void); /** * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. * * @return A pointer to the host buffer type interface. */ -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); /** * @brief Retrieves the description of a specific CANN device. @@ -97,7 +99,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type * @param description Pointer to a buffer where the description will be written. * @param description_size Size of the description buffer. 
*/ -GGML_API GGML_CALL void ggml_backend_cann_get_device_description( +GGML_API void ggml_backend_cann_get_device_description( int32_t device, char* description, size_t description_size); /** @@ -112,20 +114,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description( * @param total Pointer to a variable where the total memory size will be * stored. */ -GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, - size_t* free, - size_t* total); - -/** - * @brief Set the logging callback for GGML. - * - * This function sets the logging callback and user data for logging. - * - * @param log_callback The logging callback to set. - * @param user_data User data to pass to the logging callback. - */ -GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, - void* user_data); +GGML_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); #ifdef __cplusplus } diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 71bb6dcf0..f44d8f4e6 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -3,6 +3,10 @@ #include "ggml.h" #include "ggml-backend.h" +#ifdef __cplusplus +extern "C" { +#endif + #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" #define GGML_CUBLAS_NAME "hipBLAS" @@ -13,35 +17,31 @@ #define GGML_CUDA_NAME "CUDA" #define GGML_CUBLAS_NAME "cuBLAS" #endif - -#ifdef __cplusplus -extern "C" { -#endif - #define GGML_CUDA_MAX_DEVICES 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); +GGML_API ggml_backend_t ggml_backend_cuda_init(int device); -GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); // device buffer -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); -GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); -GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); -GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); +GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); + +GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void); -GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); #ifdef 
__cplusplus } #endif diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index d483cf1ac..b8d3f678b 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -1,3 +1,5 @@ +// Note: this description is outdated +// // An interface allowing to compute ggml_cgraph with Metal // // This is a fully functional interface that extends ggml with GPU support for Apple devices. @@ -25,9 +27,6 @@ #include #include -// max memory buffers that can be mapped to the device -#define GGML_METAL_MAX_BUFFERS 64 - struct ggml_tensor; struct ggml_cgraph; @@ -40,19 +39,17 @@ extern "C" { // user-code should use only these functions // -GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); - GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); - -GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); +GGML_DEPRECATED( + GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), + "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713"); GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family // ideally, the user code should be doing these checks @@ -62,6 +59,8 @@ GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int fam // capture all command buffers committed the next time `ggml_backend_graph_compute` is called GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); +GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void); + #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index aa144832a..d57967368 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -10,14 +10,18 @@ extern "C" { #define GGML_RPC_MAX_SERVERS 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); -GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); +GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); +GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); +GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); -GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); +GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); + +GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void); + +GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); #ifdef __cplusplus } diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 43ab1519c..af521f599 100644 --- a/ggml/include/ggml-sycl.h +++ 
b/ggml/include/ggml-sycl.h @@ -19,24 +19,31 @@ extern "C" { // backend API GGML_API ggml_backend_t ggml_backend_sycl_init(int device); +GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend); + // devide buffer GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); -GGML_API void ggml_backend_sycl_print_sycl_devices(void); -GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len); -GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size); -GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); -GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); +GGML_API void ggml_backend_sycl_get_device_description(int device, + char *description, + size_t description_size); +GGML_API int ggml_backend_sycl_get_device_count(); +GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); // SYCL doesn't support registering host memory, keep here for reference -// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); -// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); +// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); + +GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void); + #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h index af661c2d7..c03bbfe5e 100644 --- a/ggml/include/ggml-vulkan.h +++ b/ggml/include/ggml-vulkan.h @@ -13,16 +13,18 @@ extern "C" { GGML_API void ggml_vk_instance_init(void); // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); +GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); -GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); -GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); -GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); -GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); +GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_API int ggml_backend_vk_get_device_count(void); +GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t 
ggml_backend_vk_host_buffer_type(void); + +GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e24b8a319..de3c706fc 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -187,16 +187,6 @@ # define GGML_API #endif -#ifdef GGML_MULTIPLATFORM -# if defined(_WIN32) -# define GGML_CALL -# else -# define GGML_CALL __attribute__((__ms_abi__)) -# endif -#else -# define GGML_CALL -#endif - // TODO: support for clang #ifdef __GNUC__ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) @@ -229,14 +219,16 @@ #define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 -#ifndef GGML_MAX_NAME -#define GGML_MAX_NAME 64 #define GGML_MAX_N_THREADS 512 - -#endif #define GGML_MAX_OP_PARAMS 64 + +#ifndef GGML_MAX_NAME +# define GGML_MAX_NAME 64 +#endif + #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 + #if UINTPTR_MAX == 0xFFFFFFFF #define GGML_MEM_ALIGN 4 #else @@ -259,21 +251,21 @@ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #ifndef NDEBUG -#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) +# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) #elif defined(__GNUC__) -#define GGML_UNREACHABLE() __builtin_unreachable() +# define GGML_UNREACHABLE() __builtin_unreachable() #elif defined(_MSC_VER) -#define GGML_UNREACHABLE() __assume(0) +# define GGML_UNREACHABLE() __assume(0) #else -#define GGML_UNREACHABLE() ((void) 0) +# define GGML_UNREACHABLE() ((void) 0) #endif #ifdef __cplusplus -#define GGML_NORETURN [[noreturn]] +# define GGML_NORETURN [[noreturn]] #elif defined(_MSC_VER) -#define GGML_NORETURN __declspec(noreturn) +# define GGML_NORETURN __declspec(noreturn) #else -#define GGML_NORETURN _Noreturn +# define GGML_NORETURN _Noreturn #endif #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) @@ -338,7 +330,7 @@ extern "C" { }; // get ggml_status name string - GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); + GGML_API const char * ggml_status_to_string(enum ggml_status status); // ieee 754-2008 half-precision float16 // todo: make this not an integral type @@ -464,6 +456,7 @@ extern "C" { GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_ARGMAX, + GGML_OP_COUNT_EQUAL, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, GGML_OP_CONCAT, @@ -575,10 +568,10 @@ extern "C" { // this tensor... 
enum ggml_tensor_flag { - GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph - GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph - GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters - GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph + GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph + GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters + GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) }; // n-dimensional tensor @@ -714,46 +707,46 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); - GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type); - GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + GGML_API int64_t ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_DEPRECATED( GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float "use ggml_row_size() instead"); - GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type); - GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op); - GGML_API const char * ggml_op_symbol(enum ggml_op op); + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); - GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name - GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor); + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type); + GGML_API bool ggml_is_quantized(enum ggml_type type); // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); - GGML_API GGML_CALL 
bool ggml_is_empty (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); - GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars - GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() - GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 - GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() + GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 + GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -845,7 +838,7 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); @@ -1002,6 +995,12 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // count number of equal elements in a and b + GGML_API struct ggml_tensor * ggml_count_equal( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b GGML_API struct ggml_tensor * ggml_repeat( @@ -1408,14 +1407,14 @@ extern "C" { // supports 3D: a->ne[2] == b->ne[1] GGML_API struct ggml_tensor * ggml_get_rows( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * a, // data + struct ggml_tensor * b); // row indices GGML_API struct ggml_tensor * ggml_get_rows_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c); + struct ggml_tensor * a, // gradients of ggml_get_rows result + struct ggml_tensor * b, // row indices + struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape GGML_API struct 
ggml_tensor * ggml_diag( struct ggml_context * ctx, @@ -1559,16 +1558,16 @@ extern "C" { "use ggml_rope_ext_inplace instead"); // compute correction dims for YaRN RoPE scaling - GGML_CALL void ggml_rope_yarn_corr_dims( + void ggml_rope_yarn_corr_dims( int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, + struct ggml_tensor * a, // gradients of ggml_rope result + struct ggml_tensor * b, // positions + struct ggml_tensor * c, // freq factors int n_dims, int mode, int n_ctx_orig, @@ -2034,15 +2033,15 @@ extern "C" { // loss function GGML_API struct ggml_tensor * ggml_cross_entropy_loss( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct ggml_tensor * b); // labels GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c); + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct ggml_tensor * b, // labels + struct ggml_tensor * c); // gradients of cross_entropy_loss result // AdamW optimizer step // Paper: https://arxiv.org/pdf/1711.05101v3.pdf @@ -2050,6 +2049,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_opt_step_adamw( struct ggml_context * ctx, struct ggml_tensor * a, + struct ggml_tensor * grad, float alpha, float beta1, float beta2, @@ -2064,7 +2064,7 @@ extern "C" { GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate); GGML_API void ggml_build_opt_adamw( struct ggml_context * ctx, @@ -2174,6 +2174,10 @@ extern "C" { typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. 
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -2484,6 +2488,7 @@ extern "C" { GGML_API int ggml_cpu_has_avx512_vbmi(void); GGML_API int ggml_cpu_has_avx512_vnni(void); GGML_API int ggml_cpu_has_avx512_bf16(void); + GGML_API int ggml_cpu_has_amx_int8 (void); GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_neon (void); GGML_API int ggml_cpu_has_sve (void); @@ -2507,6 +2512,9 @@ extern "C" { GGML_API int ggml_cpu_has_cann (void); GGML_API int ggml_cpu_has_llamafile (void); + // get the sve vector length in bytes + GGML_API int ggml_cpu_get_sve_cnt(void); + // // Internal types and functions exposed for tests and benchmarks // @@ -2528,7 +2536,7 @@ extern "C" { typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y, int nr, int nc); - typedef struct { + struct ggml_type_traits { const char * type_name; int64_t blck_size; int64_t blck_size_interleave; // interleave elements in blocks @@ -2544,9 +2552,9 @@ extern "C" { int64_t ncols; // number of columns to process simultaneously ggml_gemv_t gemv; ggml_gemm_t gemm; - } ggml_type_traits_t; + }; - GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cbc349500..aa405e4d0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -163,8 +163,8 @@ if (GGML_OPENMP) list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") - list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include") + list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -190,22 +190,24 @@ if (GGML_BLAS) # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 find_package(PkgConfig REQUIRED) if (${GGML_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS REQUIRED blas) + pkg_check_modules(DepBLAS blas) elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") # As of openblas v0.3.22, the 64-bit is named openblas64.pc pkg_check_modules(DepBLAS openblas64) if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS REQUIRED openblas) + pkg_check_modules(DepBLAS openblas) endif() elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") - pkg_check_modules(DepBLAS REQUIRED blis) + add_compile_definitions(GGML_BLAS_USE_BLIS) + pkg_check_modules(DepBLAS blis) elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS REQUIRED blas-atlas) + pkg_check_modules(DepBLAS blas-atlas) elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + pkg_check_modules(DepBLAS flexiblas_api) elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") + add_compile_definitions(GGML_BLAS_USE_MKL) # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + pkg_check_modules(DepBLAS mkl-sdl) elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") # this doesn't provide pkg-config # suggest to assign BLAS_INCLUDE_DIRS on your own @@ -265,6 +267,26 @@ if (GGML_LLAMAFILE) set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp) endif() +if (GGML_AMX) + if (CMAKE_COMPILER_IS_GNUCC AND 
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) + else() + set(GGML_AMX OFF) + message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.") + endif() + + if (GGML_AMX) + message(STATUS "Using AMX") + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX) + + file(GLOB GGML_HEADERS_AMX "ggml-amx/*.h") + list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h") + + file(GLOB GGML_SOURCES_AMX "ggml-amx/*.cpp") + list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp") + endif() +endif() + if (GGML_CUDA) cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES @@ -511,8 +533,8 @@ if (GGML_HIPBLAS) endif() if (GGML_SYCL) - if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) @@ -532,6 +554,9 @@ if (GGML_SYCL) list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) if (GGML_SYCL_F16) + if (GGML_SYCL_TARGET STREQUAL "AMD") + message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.") + endif() add_compile_definitions(GGML_SYCL_F16) endif() @@ -543,6 +568,12 @@ if (GGML_SYCL) if (GGML_SYCL_TARGET STREQUAL "NVIDIA") add_compile_definitions(GGML_SYCL_WARP_SIZE=32) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + # INFO: Allowed Sub_group_sizes are not consistent through all + # hip targets. For example, 64 is used for certain models, but the backend + # does not support it. + # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32) + add_compile_definitions(GGML_SYCL_WARP_SIZE=32) else() add_compile_definitions(GGML_SYCL_WARP_SIZE=16) endif() @@ -576,6 +607,12 @@ if (GGML_SYCL) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + if (GGML_SYCL_HIP_TARGET STREQUAL "") + message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}") + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) endif() endif() endif() @@ -1163,6 +1200,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>) endif() + if (GGML_AMX_TILE) + add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) + add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>) + endif() + if (GGML_AMX_INT8) + add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) + add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>) + endif() + if (GGML_AMX_BF16) + add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) + add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>) + endif() elseif (GGML_AVX2) list(APPEND ARCH_FLAGS /arch:AVX2) elseif (GGML_AVX) @@ -1198,6 +1247,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (GGML_AVX512_BF16) list(APPEND ARCH_FLAGS -mavx512bf16) endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") @@ -1310,7 +1368,7 @@ add_library(ggml
../include/ggml-backend.h ggml.c ggml-alloc.c - ggml-backend.c + ggml-backend.cpp ggml-quants.c ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} @@ -1323,6 +1381,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX} ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN} ggml-aarch64.c ggml-aarch64.h ) @@ -1346,6 +1405,10 @@ if (MATH_LIBRARY) endif() endif() +if (CMAKE_SYSTEM_NAME MATCHES "Android") + list(APPEND GGML_EXTRA_LIBS_PRIVATE dl) # Must be linked explicitly +endif() + list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE) list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC) target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC}) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 8912de63d..b27f41147 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_ return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); } -// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported. -static int sve_lane_count(void) { -#if defined(__ARM_FEATURE_SVE) - return ggml_sve_cnt_b; -#else - return 0; -#endif -} - void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) #if defined(__ARM_FEATURE_SVE) - if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) { + if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) { + if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 70187b9b6..041de9e3e 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -14,7 +14,7 @@ //#define GGML_ALLOCATOR_DEBUG -//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) +//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) #define AT_PRINTF(...) 
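The ggml-alloc.c hunks that follow replace direct fprintf(stderr, ...) diagnostics with the GGML_LOG_ERROR/GGML_LOG_DEBUG macros, so allocator messages can be captured through the ggml_log_set() hook added to ggml.h in this patch. A minimal sketch of how an application might capture them (illustrative only, not part of the patch; example_log_callback is a hypothetical name, the signature follows the ggml_log_callback typedef from ggml.h):

#include "ggml.h"
#include <stdio.h>

// illustrative sketch: forward ggml log output to an application-owned FILE *
static void example_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * out = (FILE *) user_data;
    fprintf(out, "[ggml:%d] %s", (int) level, text); // the messages in these hunks already end with '\n'
}

// during startup:
//     ggml_log_set(example_log_callback, stderr);
// per the header comment, passing NULL (or never calling ggml_log_set) keeps the default output on stderr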
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso size = GGML_PAD(size, talloc->alignment); if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); GGML_ABORT("not enough space in the buffer"); } @@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz best_fit_block = alloc->n_free_blocks - 1; } else { // this should never happen - fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); GGML_ABORT("not enough space in the buffer"); } @@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz } } } - fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i].tensor) { - fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, alloc->allocated_tensors[i].offset, alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } #endif @@ -348,7 +348,6 @@ struct tensor_alloc { }; struct leaf_alloc { - int buffer_id; struct tensor_alloc leaf; }; @@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; @@ -768,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views if (new_size > cur_size || galloc->buffers[i] == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); return false; } ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); @@ 
-825,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { if (galloc->n_nodes != graph->n_nodes) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of nodes\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__); #endif return true; } if (galloc->n_leafs != graph->n_leafs) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of leafs\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__); #endif return true; } @@ -843,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); + GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name); #endif return true; } @@ -855,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph } if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG - fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); + GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif return true; } @@ -869,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (ggml_gallocr_needs_realloc(galloc, graph)) { if (galloc->n_buffers == 1) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); + GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__); #endif if (!ggml_gallocr_reserve(galloc, graph)) { return false; } } else { #ifndef NDEBUG - fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); + GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); #endif return false; } @@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx, ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); + GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); #endif for (size_t i = 0; i < *n_buffers; i++) { ggml_backend_buffer_free((*buffers)[i]); @@ -990,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", + GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), this_size, max_size); @@ -1022,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (n_buffers == 0) { #ifndef NDEBUG - fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); + GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif return NULL; } diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp new file mode 100644 index 000000000..ac6ec2342 --- /dev/null +++ b/ggml/src/ggml-amx.cpp @@ -0,0 +1,453 @@ +#include "ggml-amx.h" +#include 
"ggml-amx/common.h" +#include "ggml-amx/mmq.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#if defined(__gnu_linux__) +#include +#include +#endif + +#include +#include +#include + +#if defined(__AMX_INT8__) + +// AMX buffer interface +static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) { + return "AMX"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); +} + +static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)(buffer->context); +} + +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (qtype_has_amx_kernels(tensor->type)) { + ggml_backend_amx_convert_weight(tensor, data, offset, size); + } else { + memcpy((char *)tensor->data + offset, data, size); + } + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + if (qtype_has_amx_kernels(src->type)) { + ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst)); + } else { + memcpy(dst->data, src->data, ggml_nbytes(src)); + } + return true; + } + return false; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { + /* .get_name = */ ggml_backend_amx_buffer_get_name, + /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, + /* .get_base = */ ggml_backend_amx_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, + /* .clear = */ ggml_backend_amx_buffer_clear, + /* .reset = */ NULL, +}; + +static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "AMX"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * data = aligned_alloc(TENSOR_ALIGNMENT, size); + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); +} + +static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + return 
ggml_backend_amx_get_alloc_size(tensor); + + GGML_UNUSED(buft); +} + +static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { + /* .iface = */ { + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_amx_buffer_type_is_host, + }, + /* .device = */ NULL, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_amx; +} + +// backend interface + +static const char * ggml_backend_amx_name(ggml_backend_t backend) { + return "AMX"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_amx_free(ggml_backend_t backend) { + ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; + delete ctx; + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_amx_buffer_type(); + + GGML_UNUSED(backend); +} + +static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_amx_mul_mat(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i ggml_backend_amx_i = { + /* .get_name = */ ggml_backend_amx_name, + /* .free = */ ggml_backend_amx_free, + /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_amx_graph_compute, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_amx_guid() { + static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e }; + return &guid; +} + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 + +static bool ggml_amx_init() { +#if defined(__gnu_linux__) + if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { + fprintf(stderr, "AMX is not ready to be used!\n"); + return false; + } + return true; +#elif defined(_WIN32) + return true; +#endif +} + +ggml_backend_t ggml_backend_amx_init() { + + // invoke a Linux system call to request access to AMX features + ggml_amx_init(); + + // backend context + ggml_backend_amx_context * ctx = new ggml_backend_amx_context; + + // ggml amx backend + 
ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_amx_guid(), + /* .interface = */ ggml_backend_amx_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), + /* .context = */ ctx, + }; + + return backend; +} + +bool ggml_backend_is_amx(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid()); +} + +void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { + GGML_ASSERT(ggml_backend_is_amx(backend_amx)); + + ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context; + ctx->n_threads = n_threads; +} + +// device interface + +static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) { + return "AMX"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) { + return "Intel Advanced Matrix Extensions"; + + GGML_UNUSED(dev); +} + +static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_CPU; + + GGML_UNUSED(dev); +} + +static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_amx_device_get_name(dev); + props->description = ggml_backend_amx_device_get_description(dev); + props->type = ggml_backend_amx_device_get_type(dev); + ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total); + + // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) { + return ggml_backend_amx_init(); + + GGML_UNUSED(dev); + GGML_UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_amx_buffer_type(); + + GGML_UNUSED(dev); +} + +static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + + case GGML_OP_MUL_MAT: { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + const enum ggml_type type = src0->type; + const int64_t ne0 = op->ne[0]; + + bool is_training = src0->grad || src1->grad; + + // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 + // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 + bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); + + bool can_use_amx = + is_contiguous_2d(src0) && // src0 must be contiguous + is_contiguous_2d(src1) && // src1 must be contiguous + !is_training && // inference only + src1->type == GGML_TYPE_F32 && // src1 must be float32 + has_amx_kernels && // with amx kernel impls + ne0 % (TILE_N * 2) == 0; // out_features is 32x + + return can_use_amx; + } + default: + return false; + } + + GGML_UNUSED(dev); +} + +static bool 
ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_amx_device_i = { + /* .get_name = */ ggml_backend_amx_device_get_name, + /* .get_description = */ ggml_backend_amx_device_get_description, + /* .get_memory = */ ggml_backend_amx_device_get_memory, + /* .get_type = */ ggml_backend_amx_device_get_type, + /* .get_props = */ ggml_backend_amx_device_get_props, + /* .init_backend = */ ggml_backend_amx_device_init, + /* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_amx_device_supports_op, + /* .supports_buft = */ ggml_backend_amx_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend reg interface + +static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) { + return "AMX"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device ggml_backend_amx_device = { + /* .iface = */ ggml_backend_amx_device_i, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &ggml_backend_amx_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_amx_set_n_threads; + } + return NULL; + + GGML_UNUSED(reg); + GGML_UNUSED(name); +} + +static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { + /* .get_name = */ ggml_backend_amx_reg_get_name, + /* .get_device_count = */ ggml_backend_amx_reg_get_device_count, + /* .get_device = */ ggml_backend_amx_reg_get_device, + /* .get_proc_address = */ ggml_backend_amx_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_amx_reg(void) { + static struct ggml_backend_reg ggml_backend_amx_reg = { + /* .iface = */ ggml_backend_amx_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_amx_reg; +} + +#else // if defined(__AMX_INT8__) + +ggml_backend_t ggml_backend_amx_init(void) { + fprintf(stderr, "GGML is not compiled with AMX support!\n"); + return ggml_backend_t{}; +} + +void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { + fprintf(stderr, "GGML is not compiled with AMX support!\n"); + + GGML_UNUSED(backend_amx); + GGML_UNUSED(n_threads); +} + +#endif diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-amx/common.h new file mode 100644 index 000000000..2b6c63527 --- /dev/null +++ b/ggml/src/ggml-amx/common.h @@ -0,0 +1,93 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpu-impl.h" // + +#include +#include +#include + +#if defined(_OPENMP) +#include +#endif + +#define TILE_M 16 +#define TILE_N 16 +#define TILE_K 32 +#define VNNI_BLK 4 + +#define AMX_BLK_SIZE 32 + +#define TMM0 0 +#define TMM1 1 +#define TMM2 2 +#define TMM3 3 +#define TMM4 4 +#define TMM5 5 +#define TMM6 6 +#define TMM7 7 + +// parallel routines +template ::value, int>::type = 0> +inline T div_up(T x, T y) { return (x + y - 1) / y; } + +template +inline void balance211(T n, T nth, T ith, 
T& n_start, T& n_end) { +#if 0 + // onednn partition pattern + T& n_my = n_end; + if (nth <= 1 || n == 0) { + n_start = 0; + n_my = n; + } else { + T n1 = div_up(n, nth); + T n2 = n1 - 1; + T T1 = n - n2 * nth; + n_my = ith < T1 ? n1 : n2; + n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2; + } + n_end += n_start; +#else + // pytorch aten partition pattern + T n_my = div_up(n, nth); + n_start = ith * n_my; + n_end = std::min(n_start + n_my, n); +#endif +} + +template +inline void parallel_for(int nth, int n, const func_t& f) { +#if defined(_OPENMP) +#pragma omp parallel num_threads(nth) +{ + //int nth = omp_get_num_threads(); + int ith = omp_get_thread_num(); + int tbegin, tend; + balance211(n, nth, ith, tbegin, tend); + f(tbegin, tend); +} +#else + f(0, n); + + GGML_UNUSED(nth); +#endif +} + +// quantized types that have AMX support +inline bool qtype_has_amx_kernels(const enum ggml_type type) { + // TODO: fix padding for vnni format + return (type == GGML_TYPE_Q4_0) || + (type == GGML_TYPE_Q4_1); + //(type == GGML_TYPE_Q8_0) || + //(type == GGML_TYPE_Q4_K) || + //(type == GGML_TYPE_Q5_K) || + //(type == GGML_TYPE_Q6_K) || + //(type == GGML_TYPE_IQ4_XS); +} + +// ggml backend context +struct ggml_backend_amx_context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr work_data; + size_t work_size = 0; +}; diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-amx/mmq.cpp new file mode 100644 index 000000000..239d15121 --- /dev/null +++ b/ggml/src/ggml-amx/mmq.cpp @@ -0,0 +1,2509 @@ + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif + +#include "mmq.h" +#include "ggml-impl.h" +#include "ggml-quants.h" +#include +#include + +#if defined(__gnu_linux__) +#include +#include +#endif + +#if defined(_OPENMP) +#include +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#define ALWAYS_INLINE __forceinline +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define ALWAYS_INLINE inline +#endif + +#if defined(__AMX_INT8__) + +namespace { + +// Forced unrolling +template +struct Unroll { + template + ALWAYS_INLINE void operator()(const Func& f, Args... args) const { + Unroll{}(f, args...); + f(std::integral_constant{}, args...); + } +}; + +template <> +struct Unroll<1> { + template + ALWAYS_INLINE void operator()(const Func& f, Args... args) const { + f(std::integral_constant{}, args...); + } +}; + +// type traits +template struct PackedTypes {}; +template <> struct PackedTypes { using type = int8_t; }; +template <> struct PackedTypes { using type = uint8_t; }; +template <> struct PackedTypes { using type = int8_t; }; +template using packed_B_type = typename PackedTypes::type; + +template +struct do_compensate : std::integral_constant::value> {}; + +template +struct do_unpack : std::integral_constant::value || + std::is_same::value> {}; + +template +struct is_type_qkk : std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value> {}; + +#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...) 
\ + [&] { \ + switch (TYPE) { \ + case GGML_TYPE_F16: { \ + using type = ggml_fp16_t; \ + constexpr int blck_size = 16; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_BF16: { \ + using type = ggml_bf16_t; \ + constexpr int blck_size = 32; \ + return __VA_ARGS__(); \ + } \ + default: \ + fprintf(stderr, "Unsupported floating data type\n"); \ + } \ + }() + +#define GGML_DISPATCH_QTYPES(QT, ...) \ + [&] { \ + switch (QT) { \ + case GGML_TYPE_Q4_0: { \ + using type = block_q4_0; \ + using vec_dot_type = block_q8_0; \ + constexpr int blck_size = QK4_0; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_Q4_1: { \ + using type = block_q4_1; \ + using vec_dot_type = block_q8_1; \ + constexpr int blck_size = QK4_1; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_Q8_0: { \ + using type = block_q8_0; \ + using vec_dot_type = block_q8_0; \ + constexpr int blck_size = QK8_0; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_Q4_K: { \ + using type = block_q4_K; \ + using vec_dot_type = block_q8_K; \ + constexpr int blck_size = QK_K; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_Q5_K: { \ + using type = block_q5_K; \ + using vec_dot_type = block_q8_K; \ + constexpr int blck_size = QK_K; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_Q6_K: { \ + using type = block_q6_K; \ + using vec_dot_type = block_q8_K; \ + constexpr int blck_size = QK_K; \ + return __VA_ARGS__(); \ + } \ + case GGML_TYPE_IQ4_XS: { \ + using type = block_iq4_xs; \ + using vec_dot_type = block_q8_K; \ + constexpr int blck_size = QK_K; \ + return __VA_ARGS__(); \ + } \ + default: \ + fprintf(stderr, "Unsupported quantized data type: %d\n", int(TYPE)); \ + } \ + }() + +#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \ + [&] { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +// define amx tile config data structure +struct tile_config_t{ + uint8_t palette_id = 0; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +}; + +// Notes: amx tile config +// +// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values, +// and accumulate the result to a 16 x 16 matrix C containing INT32 values, +// +// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used +// instead of the normally used 16-16-64 config. +// +// Block A: {16, 32}, dtype = int8_t +// Block B: {16, 32}, dtype = uint8_t/int8_t +// Block C: {16, 16}, dtype = int32_t +// +// Block B needs to be prepacked to vnni format before feeding into TMUL: +// packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64} +// +// Therefore, we get tileconfig: +// A B C +// rows 16 8 16 +// colsb 32 64 16 +// +// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1, +// C used TMM4-TMM7: +// B TMM0 B TMM1 +// A TMM2 C TMM4 C TMM6 +// A TMM3 C TMM5 C TMM7 +// +// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A +// will be needed. +// +// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16; +// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`. 
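+//
+// Rough worked example (a sketch of one KB step, not the literal kernel code below): with
+// blck_size 32 (e.g. Q4_0), A rows are loaded into TMM2/TMM3 as {16, 32} int8, the vnni
+// packed B columns into TMM0/TMM1 as {8, 64} bytes, and one TDP instruction per C tile
+// accumulates TMM4 += TMM2 x TMM0, TMM5 += TMM3 x TMM0, TMM6 += TMM2 x TMM1 and
+// TMM7 += TMM3 x TMM1, each C tile holding {16, 16} int32 (64 bytes per row).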
+// +// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/ +// advanced-matrix-extensions-intrinsics-functions.html +// + +#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb +void ggml_tile_config_init(void) { + static thread_local bool is_first_time = true; + + if (!is_first_time) { + return; + } + + static thread_local tile_config_t tc; + tile_config_t current_tc; + _tile_storeconfig(¤t_tc); + + // load only when config changes + if (tc.palette_id == 0 || (memcmp(¤t_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 && + memcmp(¤t_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) { + tc.palette_id = 1; + tc.start_row = 0; + TC_CONFIG_TILE(TMM0, 8, 64); + TC_CONFIG_TILE(TMM1, 8, 64); + TC_CONFIG_TILE(TMM2, 16, 32); + TC_CONFIG_TILE(TMM3, 16, 32); + TC_CONFIG_TILE(TMM4, 16, 64); + TC_CONFIG_TILE(TMM5, 16, 64); + TC_CONFIG_TILE(TMM6, 16, 64); + TC_CONFIG_TILE(TMM7, 16, 64); + _tile_loadconfig(&tc); + } + + is_first_time = false; +} + +// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation. +// See the notes `s8s8 igemm compensation in avx512-vnni` for detail. +template +int get_tile_size() { + int tile_size = TILE_N * sizeof(TB); + if (do_compensate::value) { + tile_size += TILE_N * sizeof(int32_t); + } + if (std::is_same::value || + std::is_same::value) { + tile_size += TILE_N * 4; + } + if (std::is_same::value) { + tile_size += TILE_N * 2; + } + return tile_size; +} + +template +int get_row_size(int K) { + int KB = K / BLOCK_K; + int row_size = KB * sizeof(TB); + if (do_compensate::value) { + row_size += KB * sizeof(int32_t); + } + if (std::is_same::value || + std::is_same::value) { + row_size += KB * 4; + } + if (std::is_same::value) { + row_size += KB * 2; + } + return row_size; +} + +// vectorized dtype conversion +inline float FP16_TO_FP32(ggml_half val) { + __m256i v = _mm256_setr_epi16( + val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + __m512 o = _mm512_cvtph_ps(v); + return _mm512_cvtss_f32(o); +} + +inline __m512 FP16_TO_FP32_VEC(ggml_half val) { + __m256i v = _mm256_set1_epi16(val); + return _mm512_cvtph_ps(v); +} + +// horizontal reduce +inline float _mm512_reduce_max_ps(const __m512 x) { + __m512 v = x; + __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E); + v = _mm512_max_ps(v, v1); + v1 = _mm512_shuffle_f32x4(v, v, 0xB1); + v = _mm512_max_ps(v, v1); + v1 = _mm512_shuffle_ps(v, v, 0x4E); + v = _mm512_max_ps(v, v1); + v1 = _mm512_shuffle_ps(v, v, 0xB1); + v = _mm512_max_ps(v, v1); + return _mm512_cvtss_f32(v); +} + +// transpose utils +#define SHUFFLE_EPI32(a, b, mask) \ + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask)) +inline void transpose_8x8_32bit(__m256i * v, __m256i * v1) { + // unpacking and 32-bit elements + v1[0] = _mm256_unpacklo_epi32(v[0], v[1]); + v1[1] = _mm256_unpackhi_epi32(v[0], v[1]); + v1[2] = _mm256_unpacklo_epi32(v[2], v[3]); + v1[3] = _mm256_unpackhi_epi32(v[2], v[3]); + v1[4] = _mm256_unpacklo_epi32(v[4], v[5]); + v1[5] = _mm256_unpackhi_epi32(v[4], v[5]); + v1[6] = _mm256_unpacklo_epi32(v[6], v[7]); + v1[7] = _mm256_unpackhi_epi32(v[6], v[7]); + + // shuffling the 32-bit elements + v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44); + v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee); + v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44); + v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee); + v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44); + v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee); + v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44); + v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee); + + // shuffling 
128-bit elements + v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02); + v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02); + v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02); + v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02); + v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13); + v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13); + v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13); + v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13); +} + +inline void transpose_16x4_32bit(__m512i * r, __m512i * d) { + + static const __m512i index1 = _mm512_set_epi32( + 0x0f, 0x0b, 0x07, 0x03, + 0x0e, 0x0a, 0x06, 0x02, + 0x0d, 0x09, 0x05, 0x01, + 0x0c, 0x08, 0x04, 0x00); + + d[0] = _mm512_permutexvar_epi32(index1, r[0]); + d[1] = _mm512_permutexvar_epi32(index1, r[1]); + d[2] = _mm512_permutexvar_epi32(index1, r[2]); + d[3] = _mm512_permutexvar_epi32(index1, r[3]); + + r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44); + r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee); + r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44); + r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee); + + d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88); + d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd); + d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88); + d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd); +} + +inline void transpose_16x16_32bit(__m512i * v) { + __m512i v1[16]; + v1[0] = _mm512_unpacklo_epi32(v[0], v[1]); + v1[1] = _mm512_unpackhi_epi32(v[0], v[1]); + v1[2] = _mm512_unpacklo_epi32(v[2], v[3]); + v1[3] = _mm512_unpackhi_epi32(v[2], v[3]); + v1[4] = _mm512_unpacklo_epi32(v[4], v[5]); + v1[5] = _mm512_unpackhi_epi32(v[4], v[5]); + v1[6] = _mm512_unpacklo_epi32(v[6], v[7]); + v1[7] = _mm512_unpackhi_epi32(v[6], v[7]); + v1[8] = _mm512_unpacklo_epi32(v[8], v[9]); + v1[9] = _mm512_unpackhi_epi32(v[8], v[9]); + v1[10] = _mm512_unpacklo_epi32(v[10], v[11]); + v1[11] = _mm512_unpackhi_epi32(v[10], v[11]); + v1[12] = _mm512_unpacklo_epi32(v[12], v[13]); + v1[13] = _mm512_unpackhi_epi32(v[12], v[13]); + v1[14] = _mm512_unpacklo_epi32(v[14], v[15]); + v1[15] = _mm512_unpackhi_epi32(v[14], v[15]); + + v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]); + v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]); + v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]); + v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]); + v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]); + v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]); + v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]); + v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]); + v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]); + v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]); + v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]); + v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]); + v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]); + v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]); + v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]); + v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]); + + v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88); + v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88); + v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88); + v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88); + v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd); + v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd); + v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd); + v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd); + v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88); + v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88); + v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88); + v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88); + v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd); + v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 
0xdd); + v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd); + v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd); + + v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88); + v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88); + v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88); + v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88); + v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88); + v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88); + v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88); + v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88); + v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd); + v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd); + v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd); + v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd); + v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd); + v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd); + v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd); + v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd); +} + +void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + const int KB = k / QK_K; + constexpr int kVecs = QK_K / 16; + + block_q8_K * y = reinterpret_cast(vy); + + // hold 16 float vecs from x + __m512 v[kVecs]; + + // hold the quants vecs + __m512i vq[kVecs / 4]; + + // hold the packed quants vecs + __m512i vq_packed[kVecs / 4]; + + const __m512 signBit = _mm512_set1_ps(-0.f); + + for (int i = 0; i < KB; ++i) { + // Compute max(abs(e)) for the block + __m512 vamax = _mm512_set1_ps(0.f); + for (int j = 0; j < kVecs; ++j) { + v[j] = _mm512_loadu_ps(x); x += 16; + vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j])); + } + const float amax = _mm512_reduce_max_ps(vamax); + + // Quantize these floats + const float iscale = 127.f / amax; + y[i].d = GGML_FP32_TO_FP16(1 / iscale); + const float id = ( amax != 0.0f ) ? 
iscale : 0.f; + const __m512 vscale = _mm512_set1_ps(id); + + // Apply multiplier and round to nearest integer + for (int j = 0; j < kVecs; ++j) { + v[j] = _mm512_mul_ps(v[j], vscale); + v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + + // Pack to epi8 vecs + for (int j = 0; j < kVecs / 4; ++j) { + __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0])); + __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1])); + __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2])); + __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3])); + + __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1); + __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1); + + vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1); + _mm512_storeu_si512((__m512i *)(y[i].qs + j * 64), vq[j]); + } + + // Compute the bsums with vnni + transpose_16x4_32bit(vq, vq_packed); + + const __m512i one = _mm512_set1_epi8(1); + __m512i sum = _mm512_setzero_si512(); + for (int k = 0; k < 4; ++k) { + sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]); + } + _mm256_storeu_si256((__m256i *)(y[i].bsums), _mm512_cvtepi32_epi16(sum)); + } +} + +// quantize A from float to `vec_dot_type` +template +inline void from_float(const float * x, char * vy, int64_t k); + +template <> +inline void from_float(const float * x, char * vy, int64_t k) { + quantize_row_q8_0(x, vy, k); +} + +template <> +inline void from_float(const float * x, char * vy, int64_t k) { + quantize_row_q8_1(x, vy, k); +} + +template <> +inline void from_float(const float * x, char * vy, int64_t k) { +#if 1 + // TODO: this is reference impl! + quantize_row_q8_K(x, vy, k); +#else + quantize_row_q8_K_vnni(x, vy, k); +#endif +} + +// load A from memory to array when nrows can not fill in whole tile +void unpack_A(int8_t * RESTRICT tile, const block_q8_0 * RESTRICT A, int lda, int nr) { + assert(nr != TILE_M); + for (int m = 0; m < nr; ++m) { + const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs)); + _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); + } +} + +void unpack_A(int8_t * RESTRICT tile, const block_q8_1 * RESTRICT A, int lda, int nr) { + assert(nr != TILE_M); + for (int m = 0; m < nr; ++m) { + const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs)); + _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); + } +} + +template +void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) { + assert(nr <= TILE_M); + for (int m = 0; m < nr; ++m) { + const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs + k * 32)); + _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); + } +} + +template <> +void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) { + assert(nr <= TILE_M); + // zero padding k from 16 to 32, so that we don't have to re-config amx + const __m128i zero = _mm_setzero_si128(); + for (int m = 0; m < nr; ++m) { + const __m128i v = _mm_loadu_si128((const __m128i *)(A[m * lda].qs + k * 16)); + const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1); + _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), r); + } +} + +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) +inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = 
MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8(0xF); + return _mm256_and_si256(lowMask, bytes); +} + +// used for block_q4_K +inline __m512i bytes_from_nibbles_64(const uint8_t * rsi) { + const __m256i tmp = _mm256_loadu_si256((const __m256i *)rsi); + const __m256i lowMask = _mm256_set1_epi8(0xF); + const __m256i q4l = _mm256_and_si256(tmp, lowMask); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask); + return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1); +} + +// used for block_q5_K +inline __m512i bytes_from_nibbles_64(const uint8_t * qs, const uint8_t * qh, int k) { + const __m256i lowMask = _mm256_set1_epi8(0xF); + __m256i hmask = _mm256_set1_epi8(1); + hmask = _mm256_slli_epi16(hmask, k); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i *)qs); + const __m256i hbits = _mm256_loadu_si256((const __m256i *)qh); + + const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask); + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4); + const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4); + const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); + + return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1); +} + +// used for block_q6_K +inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t * qs, const uint8_t * qh) { + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(0x3); + + const __m256i q6bits1 = _mm256_loadu_si256((const __m256i *)qs); + const __m256i q6bits2 = _mm256_loadu_si256((const __m256i *)(qs + 32)); + const __m256i q6bitsH = _mm256_loadu_si256((const __m256i *)qh); + + const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256( q6bitsH, m2), 4); + const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4); + const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4); + const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4); + + const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0); + const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1); + const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2); + const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3); + + r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1); + r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1); +} + +inline __m512i packNibbles(__m512i r0, __m512i r1) { + return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4)); +} + +template +inline void pack_qs(void * RESTRICT packed_B, const TB * RESTRICT B, int KB) { + int8_t tmp[8 * 64]; + __m256i v[8], v2[8]; + for (int n = 0; n < 8; ++n) { + v[n] = bytes_from_nibbles_32(B[n * KB].qs); + } + transpose_8x8_32bit(v, v2); + for (int n = 0; n < 8; ++n) { + _mm256_storeu_si256((__m256i *)(tmp + n * 64), v2[n]); + } + for (int n = 0; n < 8; ++n) { + v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs); + } + transpose_8x8_32bit(v, v2); + for (int n = 0; n < 8; ++n) { + _mm256_storeu_si256((__m256i *)(tmp + n * 64 + 32), v2[n]); + } + + // pack again with 128 to fully utilize vector length + for (int n = 0; n < 8; n += 2) { + 
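+        // each pass fuses two 64-byte rows: packNibbles() keeps row n in the low nibble and
+        // row n + 1 in the high nibble, so the 8 rows of nibble-valued bytes collapse into
+        // 4 full-width stores (256 bytes total)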
__m512i r0 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64)); + __m512i r1 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64 + 64)); + __m512i r1r0 = packNibbles(r0, r1); + _mm512_storeu_si512((__m512i *)((char *)packed_B + n * 32), r1r0); + } +} + +template <> +inline void pack_qs(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) { + __m256i v[8], v2[8]; + for (int n = 0; n < 8; ++n) { + v[n] = _mm256_loadu_si256((const __m256i *)(B[n * KB].qs)); + } + transpose_8x8_32bit(v, v2); + for (int n = 0; n < 8; ++n) { + _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64), v2[n]); + } + for (int n = 0; n < 8; ++n) { + v[n] = _mm256_loadu_si256((const __m256i *)(B[(n + 8) * KB].qs)); + } + transpose_8x8_32bit(v, v2); + for (int n = 0; n < 8; ++n) { + _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64 + 32), v2[n]); + } +} + +template <> +inline void pack_qs(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) { + __m512i v[16]; + // QK_K 256 with 8 groups, handle 2 groups at a time + char * pb = (char *)packed_B; + for (int k = 0; k < QK_K / 64; ++k) { + // pack 2 groups { n, g, k} to {g, k/4, 4n} + // e.g. {16, 2, 32} to {2, 8, 64} + for (int n = 0; n < TILE_N; ++n) { + v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32); + } + + transpose_16x16_32bit(v); + + // pack again with 128 to fully utilize vector length + for (int n = 0; n < TILE_N; n += 2) { + _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1])); + pb += 64; + } + } +} + +template <> +inline void pack_qs(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) { + __m512i v[16]; + const __m512i lowMask = _mm512_set1_epi8(0xF); + // QK_K 256 with 8 groups, handle 2 groups at a time + char * pb = (char *)packed_B; + char * ph = (char *)packed_B + (QK_K / 2) * TILE_N; + for (int k = 0; k < QK_K / 64; ++k) { + // pack 2 groups { n, g, k} to {g, k/4, 4n} + // e.g. {16, 2, 32} to {2, 8, 64} + for (int n = 0; n < TILE_N; ++n) { + v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */2 * k); + } + + transpose_16x16_32bit(v); + + // 1. pack lower 4bits with 2 groups + for (int n = 0; n < TILE_N; n += 2) { + // get lower 4 bits + const __m512i r0 = _mm512_and_si512(v[n], lowMask); + const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask); + _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64; + } + + // 2. 
pack higher 1bit with 2 groups + const __m512i hmask = _mm512_set1_epi8(0x10); + for (int g = 0; g < 2; ++g) { + __m512i hbits = _mm512_setzero_si512(); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4)); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3)); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2)); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1)); + hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 8 + 4], hmask) ); + hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1)); + hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2)); + hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3)); + _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64; + } + } +} + +template <> +inline void pack_qs(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) { + __m512i v[32]; + const __m512i lowMask = _mm512_set1_epi8(0xF); + // QK_K 256 with 8 groups, handle 4 groups at a time + char * pb = (char *)packed_B; + char * ph = (char *)packed_B + (QK_K / 2) * TILE_N; + for (int k = 0; k < QK_K / 128; ++k) { + for (int n = 0; n < TILE_N; ++n) { + bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32); + } + + // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7 + transpose_16x16_32bit(v); + transpose_16x16_32bit(v + 16); + + // 1. pack lower 4bits with 4 groups + for (int n = 0; n < 32; n += 2) { + const __m512i r0 = _mm512_and_si512(v[n], lowMask); + const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask); + _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64; + } + + // 2. 
pack higher 2bit with 4 groups + const __m512i hmask = _mm512_set1_epi8(0x30); + for (int g = 0; g < 8; ++g) { + __m512i hbits = _mm512_setzero_si512(); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4)); + hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2)); + hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 4 + 2], hmask) ); + hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2)); + _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64; + } + } +} + +template <> +inline void pack_qs(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) { + __m512i v[16]; + char * pb = (char *)packed_B; + for (int k = 0; k < QK_K / 64; ++k) { + for (int n = 0; n < TILE_N; ++n) { + __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 0); + __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16); + v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); + } + + transpose_16x16_32bit(v); + + // pack again with 128 to fully utilize vector length + for (int n = 0; n < TILE_N; n += 2) { + _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1])); + pb += 64; + } + } +} + +// pack B to vnni formats in 4bits or 8 bits +void pack_B(void * RESTRICT packed_B, const block_q4_0 * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K / 2); + for (int n = 0; n < TILE_N; ++n) { + d0[n] = B[n * KB].d; + } +} + +void pack_B(void * RESTRICT packed_B, const block_q4_1 * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K / 2); + ggml_half * m0 = d0 + TILE_N; + for (int n = 0; n < TILE_N; ++n) { + d0[n] = B[n * KB].d; + m0[n] = B[n * KB].m; + } +} + +inline void s8s8_compensation(void * RESTRICT packed_B) { + // packed_B layout: + // quants {TILE_N, TILEK} int8_t + // d0 {TILE_N} ggml_half + // comp {TILE_N} int32_t + const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half); + __m512i vcomp = _mm512_setzero_si512(); + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); + for (int k = 0; k < 8; ++k) { + __m512i vb = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + k * 64)); + vcomp = _mm512_dpbusd_epi32(vcomp, off, vb); + } + _mm512_storeu_si512((__m512i *)((char *)(packed_B) + offset), vcomp); +} + +void pack_B(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K); + for (int n = 0; n < TILE_N; ++n) { + d0[n] = B[n * KB].d; + } + s8s8_compensation(packed_B); +} + +// convert 8 * {min, scale} from int6 to int8 +inline void unpack_mins_and_scales(const uint8_t * scales, uint32_t * utmp) { + const uint32_t kmask1 = 0x3f3f3f3f; + const uint32_t kmask2 = 0x0f0f0f0f; + const uint32_t kmask3 = 0x03030303; + + memcpy(utmp, scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; +} + +// packed_B layout: +// quants {8, TILE_N, 16} uint8 +// scales {8, TILE_N} uint8 +// mins {8, TILE_N} uint8 +// d {TILE_N} ggml_half +// dmin {TILE_N} ggml_half +void pack_B(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + + uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N); + 
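+    // rough offset check, assuming QK_K == 256 and TILE_N == 16: the packed quants occupy
+    // 256 / 2 * 16 = 2048 bytes, so scales start at byte 2048, mins at 2048 + 128,
+    // d at 2048 + 256 and dmin 32 bytes after d, matching the layout sketched above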
uint8_t * mins = scales + 8 * TILE_N; + ggml_half * d = reinterpret_cast(mins + 8 * TILE_N); + ggml_half * dmin = d + TILE_N; + + union { + uint32_t u32[4]; + uint8_t u8[16]; + } s; + + for (int n = 0; n < TILE_N; ++n) { + unpack_mins_and_scales(B[n * KB].scales, s.u32); + for (int k = 0; k < 8; ++k) { + scales[k * TILE_N + n] = s.u8[k]; + mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8]; + } + d[n] = B[n * KB].d; + dmin[n] = B[n * KB].dmin; + } +} + +// packed_B layout: +// quants {8, TILE_N, 16} uint8 +// qh {8, TILE_N, 4} uint8 +// scales {8, TILE_N} uint8 +// mins {8, TILE_N} uint8 +// d {TILE_N} ggml_half +// dmin {TILE_N} ggml_half +void pack_B(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + + uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N); + uint8_t * mins = scales + 8 * TILE_N; + ggml_half * d = reinterpret_cast(mins + 8 * TILE_N); + ggml_half * dmin = d + TILE_N; + + union { + uint32_t u32[4]; + uint8_t u8[16]; + } s; + + for (int n = 0; n < TILE_N; ++n) { + unpack_mins_and_scales(B[n * KB].scales, s.u32); + for (int k = 0; k < 8; ++k) { + scales[k * TILE_N + n] = s.u8[k]; + mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8]; + } + d[n] = B[n * KB].d; + dmin[n] = B[n * KB].dmin; + } +} + +// packed_B layout: +// quants {16, TILE_N, 8} uint8 +// qh {16, TILE_N, 4} uint8 +// scales {16, TILE_N} uint8 +// d {TILE_N} ggml_half +void pack_B(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + + uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N); + ggml_half * d = reinterpret_cast(scales + 16 * TILE_N); + for (int n = 0; n < TILE_N; ++n) { + const int8_t * ps = B[n * KB].scales; + for (int k = 0; k < 16; ++k) { + scales[k * TILE_N + n] = ps[k]; + } + d[n] = B[n * KB].d; + } +} + +// packed_B layout: +// quants {8, TILE_N, 16} uint8 +// scales {8, TILE_N} int8 +// d {TILE_N} ggml_half +void pack_B(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) { + pack_qs(packed_B, B, KB); + + int8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N); + ggml_half * d = reinterpret_cast(scales + 8 * TILE_N); + + // pack the scales + for (int n = 0; n < TILE_N; ++n) { + uint16_t sh = B[n * KB].scales_h; + for (int k = 0; k < 8; k += 2) { + const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >> 4) | ((sh << 2) & 0x30)) - 32; + scales[(k + 0) * TILE_N + n] = ls1; + scales[(k + 1) * TILE_N + n] = ls2; + sh >>= 4; + } + d[n] = B[n * KB].d; + } +} + +template> +void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { + GGML_UNUSED(tile); + GGML_UNUSED(packed_B); +}; + +template <> +void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { + const __m512i off = _mm512_set1_epi8(8); + const __m512i lowMask = _mm512_set1_epi8(0xF); + for (int n = 0; n < 8; n += 2) { + __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32)); + const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off); + const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); + } +} + +template <> +void unpack_B(uint8_t * RESTRICT tile, const void * RESTRICT packed_B) { + const __m512i 
lowMask = _mm512_set1_epi8(0xF); + for (int n = 0; n < 8; n += 2) { + __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32)); + const __m512i r0 = _mm512_and_si512(bytes, lowMask); + const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); + } +} + +// packed_B_t for QKK is int8_t +template +void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { + const int packed_B_group_size = QK_K / 2 * TILE_N / 8; + const char * packed_B_group = (const char *)packed_B + k * packed_B_group_size; + const __m512i lowMask = _mm512_set1_epi8(0xF); + for (int n = 0; n < 8; n += 2) { + __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32); + const __m512i r0 = _mm512_and_si512(bytes, lowMask); + const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); + } +} + +template <> +void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { + // lower 4bits, stride 256 bytes + const int packed_l4_group_size = QK_K / 2 * TILE_N / 8; + const char * pb = (const char *)packed_B + k * packed_l4_group_size; + + // higher 1bit, stride 64 bytes + const int packed_h1_group_size = QK_K / 8 * TILE_N / 8; + const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size; + const __m512i hbits = _mm512_loadu_si512(ph); + + const __m512i lowMask = _mm512_set1_epi8(0xF); + __m512i hmask0 = _mm512_set1_epi8(0x1); + __m512i hmask1 = _mm512_set1_epi8(0x2); + + for (int n = 0; n < 8; n += 2) { + __m512i bytes = _mm512_loadu_si512(pb + n * 32); + __m512i r0 = _mm512_and_si512(bytes, lowMask); + __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4); + __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4); + + hmask0 = _mm512_slli_epi16(hmask0, 2); + hmask1 = _mm512_slli_epi16(hmask1, 2); + r0 = _mm512_add_epi8(r0, h0); + r1 = _mm512_add_epi8(r1, h1); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); + } +} + +template <> +void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { + // lower 4bits, stride 128 bytes + const int packed_l4_group_size = QK_K / 2 * TILE_N / 16; + const char * pb = (const char *)packed_B + k * packed_l4_group_size; + + // higher 2bits, stride 64 bytes + const int packed_h2_group_size = QK_K / 4 * TILE_N / 16; + const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size; + const __m512i hbits = _mm512_loadu_si512(ph); + + const __m512i off = _mm512_set1_epi8(32); + const __m512i lowMask = _mm512_set1_epi8(0xF); + __m512i hmask0 = _mm512_set1_epi8(0x3); // 0011 + __m512i hmask1 = _mm512_set1_epi8(0xC); // 1100 + + // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A` + __m512i bytes = _mm512_loadu_si512(pb); + __m512i r0 = _mm512_and_si512(bytes, lowMask); + __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4); + __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2); + _mm512_storeu_si512((__m512i *)(tile + 0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off)); 
+ _mm512_storeu_si512((__m512i *)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off)); + + hmask0 = _mm512_slli_epi16(hmask0, 4); + hmask1 = _mm512_slli_epi16(hmask1, 4); + + bytes = _mm512_loadu_si512(pb + 64); + r0 = _mm512_and_si512(bytes, lowMask); + r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + h0 = _mm512_and_si512(hbits, hmask0); + h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2); + _mm512_storeu_si512((__m512i *)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off)); + _mm512_storeu_si512((__m512i *)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off)); +} + +template <> +void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { + static const __m512i values128 = _mm512_set_epi8( + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127 + ); + + const int packed_B_group_size = QK_K / 2 * TILE_N / 8; + const char * pb = (const char *)packed_B + k * packed_B_group_size; + const __m512i lowMask = _mm512_set1_epi8(0xF); + + for (int n = 0; n < 8; n += 2) { + __m512i bytes = _mm512_loadu_si512(pb + n * 32); + const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask)); + const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask)); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); + _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); + } +} + +template +struct acc_C {}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) { + const int offset = TILE_N * TILE_K / 2; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); + + for (int m = 0; m < nr; ++m) { + const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_1 * A, int lda, const void * packed_B, int nr) { + const int offset = TILE_N * TILE_K / 2; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); + const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); + + for (int m = 0; m < nr; ++m) { + const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); + vsum = _mm512_fmadd_ps(vm0, vs1, vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * 
A, int lda, const void * packed_B, int nr) { + const int offset = TILE_N * TILE_K; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); + + for (int m = 0; m < nr; ++m) { + const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { + const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N); + const uint8_t * mins = scales + 8 * TILE_N; + const ggml_half * d0 = reinterpret_cast(mins + 8 * TILE_N); + const ggml_half * dmin = d0 + TILE_N; + + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); + const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin)); + + for (int m = 0; m < nr; ++m) { + const float d1 = A[m * lda].d; + const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); + const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + + __m512i acc_m = _mm512_setzero_si512(); + for (int k = 0; k < 4; ++k) { + __m512i vmask = _mm512_set1_epi32(k); + __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s)); + __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32))); + acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); + } + + vsum = _mm512_fmadd_ps(vtile, vd, vsum); + vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { + const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N); + const uint8_t * mins = scales + 8 * TILE_N; + const ggml_half * d0 = reinterpret_cast(mins + 8 * TILE_N); + const ggml_half * dmin = d0 + TILE_N; + + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); + const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin)); + + for (int m = 0; m < nr; ++m) { + const float d1 = A[m * lda].d; + const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); + const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + + __m512i acc_m = _mm512_setzero_si512(); + for (int k = 0; k < 4; ++k) { + __m512i vmask = 
_mm512_set1_epi32(k); + __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s)); + __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32))); + acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); + } + + vsum = _mm512_fmadd_ps(vtile, vd, vsum); + vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { + const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N); + const ggml_half * d0 = reinterpret_cast(scales + 16 * TILE_N); + + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); + + for (int m = 0; m < nr; ++m) { + const float d1 = A[m * lda].d; + const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + + vsum = _mm512_fmadd_ps(vtile, vd, vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template +struct acc_C { + static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { + const int8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N); + const ggml_half * d0 = reinterpret_cast(scales + 8 * TILE_N); + + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); + + for (int m = 0; m < nr; ++m) { + const float d1 = A[m * lda].d; + const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); + const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); + + __m512 vsum; + if (is_acc) { + vsum = _mm512_loadu_ps(C + m * ldc); + } else { + vsum = _mm512_set1_ps(0.f); + } + + vsum = _mm512_fmadd_ps(vtile, vd, vsum); + _mm512_storeu_ps(C + m * ldc, vsum); + } + } +}; + +template constexpr int get_quants_size(); +template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N; } +template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; } +template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; } +template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N; } + +// used for QKK format +template ::value, int>::type = 0> +inline void scale_C(const int32_t * RESTRICT tile, int32_t * RESTRICT sumi, const void * packed_B, int k, int nr) { + const uint8_t * scales = reinterpret_cast((const char *)packed_B + get_quants_size()); + const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(scales + k * TILE_N))); + + for (int m = 0; m < nr; ++m) { + __m512i vsumi; + if (is_acc) { + vsumi = _mm512_loadu_si512(sumi + m * TILE_N); + } else { + vsumi = _mm512_setzero_si512(); + } + __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N); + vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale)); + _mm512_storeu_si512((__m512i *)(sumi + m * TILE_N), vsumi); + } +} + +template +struct tinygemm_kernel_avx { + static void apply(int K, const TA * RESTRICT A, const TB * RESTRICT B, TC * RESTRICT C, int ldc) { + GGML_UNUSED(K); + GGML_UNUSED(A); + GGML_UNUSED(B); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + } +}; + +template +struct tinygemm_kernel_avx { + static void apply(int K, const float * RESTRICT A, 
const ggml_fp16_t * RESTRICT B, float * RESTRICT C, int ldc) { + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N; + assert(BLOCK_K == 16); + + __m512 va; + __m512 vb[COLS]; + __m512 vc[ROWS * COLS]; + + auto loadc = [&](int idx) { + vc[idx] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int idx, int k) { + // TODO: use `constexpr` here to get rid of interger div + // when upgraded to C++17 + const int row = idx / COLS; + const int col = idx % COLS; + + if (col == 0) { + va = _mm512_loadu_ps(A + row * K + k); + } + if (row == 0) { + vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); + } + vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0; k < K; k += 16) { + Unroll{}(compute, k); + } + + auto storec = [&](int idx) { + const int row = idx / COLS; + const int col = idx % COLS; + C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); + }; + Unroll{}(storec); + } +}; + +#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_avx::apply( \ + K, (const float *)src1->data + mb_start * K, \ + (const type *)src0->data + nb_start * K, \ + (float *)dst->data + mb_start * ldc + nb_start, ldc); + + +// re-organize in the format {NB, KB, TILE_SIZE}: +#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size + +template +void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) { + const int NB = N / TILE_N; + const int KB = K / BLOCK_K; + const int TILE_SIZE = get_tile_size(); + + // parallel on NB should be enough + parallel_for(n_threads, NB, [&](int begin, int end) { + for (int n = begin; n < end; ++n) { + for (int k = 0; k < KB; ++k) { + int n0 = n * TILE_N; + pack_B((char *)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB); + } + } + }); +} + +template +struct tinygemm_kernel_vnni {}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q4_0); + + const block_q8_0 * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + __m512i va[8]; + __m512 vc[COLS]; + __m512 vd1; + + // sum of offsets, shared across COLS + // + // avx512-vnni does not have `_mm512_dpbssd_epi32`, + // need to transfrom ss to us: + // a * (b - 8) is equavilent to b * a - 8 * a + // s u u u s u s + // + __m512i vcomp; + + const __m512i off = _mm512_set1_epi8(8); + const __m512i lowMask = _mm512_set1_epi8(0xF); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int col, int i) { + // load a and compute compensation + if (col == 0) { + const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); + vcomp = _mm512_setzero_si512(); + for (int k = 0; k < 8; ++k) { + va[k] = _mm512_set1_epi32(a_ptr[k]); + vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]); + } + vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); + } + + // load b + __m512i vsum = _mm512_setzero_si512(); + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + for (int k = 0; k < 8; k += 2) { + __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32)); + __m512i vb0 = _mm512_and_si512(bytes, lowMask); + vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]); + __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]); + } + const int offset = TILE_N * 
TILE_K / 2; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); + vsum = _mm512_sub_epi32(vsum, vcomp); + + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q4_1); + + const block_q8_1 * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + __m512i va[8]; + __m512i vb[8]; + __m512 vc[COLS]; + __m512 vd1, vs1; + + const __m512i lowMask = _mm512_set1_epi8(0xF); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int col, int i) { + // load a + if (col == 0) { + const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); + for (int k = 0; k < 8; ++k) { + va[k] = _mm512_set1_epi32(a_ptr[k]); + } + vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); + vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); + } + + // load b + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + for (int k = 0; k < 8; k += 2) { + __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32)); + vb[k + 0] = _mm512_and_si512(bytes, lowMask); + vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + } + const int offset = TILE_N * TILE_K / 2; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); + const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset + TILE_N * sizeof(ggml_half)))); + + __m512i vsum = _mm512_setzero_si512(); + for (int k = 0; k < 8; ++k) { + vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]); + } + + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); + vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t); + + const block_q8_0 * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + __m512i va[8]; + __m512i vb[8]; + __m512 vc[COLS]; + __m512 vd1; + + // Notes: s8s8 igemm compensation in avx512-vnni + // change s8s8 to u8s8 with compensate + // a * b = (a + 128) * b - 128 * b + // s s u s u s + // + // (128 * b is pre-computed when packing B to vnni formats) + // + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int col, int i) { + // load a and add offset 128 + if (col == 0) { + const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); + for (int k = 0; k < 8; ++k) { + va[k] = _mm512_set1_epi32(a_ptr[k]); + va[k] = _mm512_add_epi8(va[k], off); + } + vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); + } + + // load b + const char * 
b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + for (int k = 0; k < 8; ++k) { + vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)); + } + const int offset = TILE_N * TILE_K; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); + const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half); + const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2)); + + __m512i vsum = _mm512_setzero_si512(); + for (int k = 0; k < 8; ++k) { + vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]); + } + vsum = _mm512_sub_epi32(vsum, vcomp); + + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4; + + const block_q8_K * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + // a.qs: 8 groups, 32 bytes each group (m256i) + __m512i va[8]; + // a.bsum: 8 groups, 2 bytes each group (m128i) + __m512i va_bsum; + __m512 vc[COLS]; + __m512 vd1; + + // packed_B: + const int offset_scales = (QK_K / 2) * TILE_N; + const int offset_mins = (QK_K / 2) * TILE_N + 8 * TILE_N; + const int offset_d0 = (QK_K / 2) * TILE_N + 16 * TILE_N; + const int offset_dmin = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half); + + const __m512i lowMask = _mm512_set1_epi8(0xF); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + // Notes: vnni formats in QK_K + // a) quants vnni format + // int8 {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32 + // from {16, 32} to {8, 64} + // + // b) min vnni format + // int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8 + // from {16, 8} to {4, 32} + // + auto compute = [&](int col, int i) { + // load a + if (col == 0) { + for (int k_group = 0; k_group < QK_K / 32; ++k_group) { + va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); + } + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + va_bsum = _mm512_castsi128_si512(q8s); + vd1 = _mm512_set1_ps(A[0 * KB + i].d); + } + + // step 1: accumultate the quants + __m512i acc = _mm512_setzero_si512(); + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + const char * b_qs = b_ptr; + for (int k_group = 0; k_group < QK_K / 32; ++k_group) { + __m512i vsum = _mm512_setzero_si512(); + for (int k = 0; k < 8; k += 2) { + __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]); + __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]); + + __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs); + __m512i vb0 = _mm512_and_si512(bytes, lowMask); + vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); + __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); + + b_qs += 64; + } + // vacc += scale * (q8 @ q4) + const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); + 
acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); + } + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); + + // step 2: accumulate the mins + __m512i acc_m = _mm512_setzero_si512(); + for (int k = 0; k < 4; ++k) { + __m512i vmask = _mm512_set1_epi32(k); + __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum); + __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32))); + acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); + } + const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin))); + vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4; + + const block_q8_K * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + // a.qs: 8 groups, 32 bytes each group (m256i) + __m512i va[8]; + // a.bsum: 8 groups, 2 bytes each group (m128i) + __m512i va_bsum; + __m512 vc[COLS]; + __m512 vd1; + + // packed_B: + const int offset_qh = (QK_K / 2) * TILE_N; + const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; + const int offset_mins = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 8 * TILE_N; + const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N; + const int offset_dmin = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half); + + const __m512i lowMask = _mm512_set1_epi8(0xF); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + // Q5_K and Q4_K share the same vnni formats, refer to notes above.
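+ // packed_B layout per tile, as implied by the offsets above (Q5_K case):
+ //   qs     : (QK_K / 2) * TILE_N bytes  - 4-bit low nibbles
+ //   qh     : (QK_K / 8) * TILE_N bytes  - high bits (Q5_K only)
+ //   scales :          8 * TILE_N int8
+ //   mins   :          8 * TILE_N int8
+ //   d      :              TILE_N ggml_half
+ //   dmin   :              TILE_N ggml_half
+ // e.g. with QK_K == 256 and TILE_N == 16: 2048 + 512 + 128 + 128 + 32 + 32 = 2880 bytes,
+ // which matches TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4 above.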
+ auto compute = [&](int col, int i) { + // load a + if (col == 0) { + for (int k_group = 0; k_group < QK_K / 32; ++k_group) { + va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); + } + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + va_bsum = _mm512_castsi128_si512(q8s); + vd1 = _mm512_set1_ps(A[0 * KB + i].d); + } + + // step 1: accumultate the quants + __m512i acc = _mm512_setzero_si512(); + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + const char * b_qs = b_ptr; + const char * b_qh = b_ptr + offset_qh; + for (int k_group = 0; k_group < QK_K / 32; ++k_group) { + __m512i vsum = _mm512_setzero_si512(); + __m512i hmask0 = _mm512_set1_epi8(0x1); + __m512i hmask1 = _mm512_set1_epi8(0x2); + __m512i hbits = _mm512_loadu_si512((const __m512i *)(b_qh + k_group * 64)); + for (int k = 0; k < 8; k += 2) { + __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]); + __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]); + + __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs); + __m512i vb0 = _mm512_and_si512(bytes, lowMask); + __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + + __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4); + __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4); + + hmask0 = _mm512_slli_epi16(hmask0, 2); + hmask1 = _mm512_slli_epi16(hmask1, 2); + vb0 = _mm512_add_epi8(vb0, vh0); + vb1 = _mm512_add_epi8(vb1, vh1); + + vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); + + b_qs += 64; + } + // vacc += scale * (q8 @ q5) + const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); + acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); + } + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); + + // step 2: accumulate the mins + __m512i acc_m = _mm512_setzero_si512(); + for (int k = 0; k < 4; ++k) { + __m512i vmask = _mm512_set1_epi32(k); + __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum); + __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32))); + acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); + } + const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin))); + vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_q6_K); + + const block_q8_K * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + // load the 256 bytes from A to 4 avx512 vectors + __m512i va[4]; + __m512 vc[COLS]; + __m512 vd1; + + // packed_B: + const int offset_qh = (QK_K / 2) * TILE_N; + const int offset_scales = 
(QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; + const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N; + + // compensation + __m512i vcomp; + + const __m512i m32s = _mm512_set1_epi32(32); + const __m512i lowMask = _mm512_set1_epi8(0xF); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int col, int i) { + if (col == 0) { + // load a + va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); + va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); + va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128)); + va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192)); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); + vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s); + vd1 = _mm512_set1_ps(A[0 * KB + i].d); + } + + // accmulate the quants + __m512i acc = _mm512_setzero_si512(); + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + const char * b_qs = b_ptr; + const char * b_qh = b_ptr + offset_qh; + int mask = 0; + for (int k_group = 0; k_group < QK_K / 16; ++k_group) { + int r = k_group >> 2; + __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + + __m512i vsum = _mm512_setzero_si512(); + __m512i hmask = _mm512_set1_epi8(0x3); + + __m512i bytes = _mm512_loadu_si512(b_qs); + __m512i hbits = _mm512_loadu_si512(b_qh); + __m512i vb0 = _mm512_and_si512(bytes, lowMask); + __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4); + __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2); + + vb0 = _mm512_add_epi8(vb0, vh0); + vb1 = _mm512_add_epi8(vb1, vh1); + vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); + b_qs += 64; + + va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + + bytes = _mm512_loadu_si512(b_qs); + vb0 = _mm512_and_si512(bytes, lowMask); + vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); + vh0 = _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4)); + vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2); + vb0 = _mm512_add_epi8(vb0, vh0); + vb1 = _mm512_add_epi8(vb1, vh1); + vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); + b_qs += 64; + b_qh += 64; + + // B * A - 32 * A + __m512i vmask = _mm512_set1_epi32(k_group); + vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp)); + + // vacc += scale * (q8 @ q6) + const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); + acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); + } + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +template +struct tinygemm_kernel_vnni { + static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + + constexpr int COLS = 
BLOCK_N / 16; + const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2; + + const block_q8_K * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + // load the 256 bytes from A to 4 avx512 vectors + __m512i va[4]; + __m512 vc[COLS]; + __m512 vd1; + + // packed_B: + const int offset_scales = (QK_K / 2) * TILE_N ; + const int offset_d0 = (QK_K / 2) * TILE_N + 8 * TILE_N; + + // compensation + __m512i vcomp; + + const __m256i m128s = _mm256_set1_epi16(128); + const __m512i lowMask = _mm512_set1_epi8(0xF); + + const __m512i values128 = _mm512_set_epi8( + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, + 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127 + ); + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); + const __m512i values256 = _mm512_add_epi8(values128, off); + + auto loadc = [&](int col) { + vc[col] = _mm512_setzero_ps(); + }; + Unroll{}(loadc); + + auto compute = [&](int col, int i) { + if (col == 0) { + // load a + va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); + va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); + va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128)); + va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192)); + + // compensation: 128 * A + const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); + vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s)); + vd1 = _mm512_set1_ps(A[0 * KB + i].d); + } + + // accmulate the quants + __m512i acc = _mm512_setzero_si512(); + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); + const char * b_qs = b_ptr; + int mask = 0; + for (int k_group = 0; k_group < QK_K / 32; ++k_group) { + int r = k_group >> 1; + __m512i vmask = _mm512_set1_epi32(k_group); + __m512i vsum = _mm512_setzero_si512(); + for (int k = 0; k < 8; k += 2) { + __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); + + __m512i bytes = _mm512_loadu_si512(b_qs); + __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask)); + __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask)); + + vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); + vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); + b_qs += 64; + } + // (B + 128) * A - 128 * A + vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp)); + + // vacc += scale * (q8 @ q4) + const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); + acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); + } + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); + vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); + }; + + for (int i = 0; i < KB; ++i) { + Unroll{}(compute, i); + } + + //store to C + auto storec = [&](int col) { + _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); + }; + Unroll{}(storec); + } +}; + +#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \ + tinygemm_kernel_vnni::apply( \ + KB, (const char *)wdata + 0 * row_size_A, \ + (const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \ + (float *) dst->data + 0 * N + nb_start, ldc) + 
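+// Notes: tile register usage in the amx kernels below:
+//   TMM0 / TMM1 : packed B tiles for the two TILE_N column blocks
+//   TMM2 / TMM3 : A tiles for the two TILE_M row blocks
+//   TMM4 - TMM7 : int32 accumulators for the four TILE_M x TILE_N output blocks
+// In the full two-row-tile path the int32 results are double buffered (C_cur / C_pre),
+// so the avx512 epilogue (acc_C) of iteration i - 1 overlaps with the amx compute of iteration i.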
+template ::value, int>::type = 0> +void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, TC * RESTRICT C, int ldc) { + using packed_B_t = packed_B_type; + const int TILE_SIZE = get_tile_size(); + const bool need_unpack = do_unpack::value; + + GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N); + const TA * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + const int m0 = std::min(M, TILE_M); + const int m1 = std::max(M - TILE_M, 0); + const int lda = KB * sizeof(TA); + //const int ldb = KB * sizeof(TB); + + static thread_local packed_B_t Tile0[TILE_N * TILE_K]; + static thread_local packed_B_t Tile1[TILE_N * TILE_K]; + static thread_local int8_t Tile23[TILE_M * TILE_K]; + + static thread_local int32_t TileC0[TILE_M * TILE_N * 4]; + static thread_local int32_t TileC1[TILE_M * TILE_N * 4]; + + // double buffering C to interleave avx512 and amx + int32_t * C_cur = TileC0; + int32_t * C_pre = TileC1; + + auto Tile4 = [&](int32_t * base) { return base; }; + auto Tile5 = [&](int32_t * base) { return base + TILE_M * TILE_N; }; + auto Tile6 = [&](int32_t * base) { return base + 2 * TILE_M * TILE_N; }; + auto Tile7 = [&](int32_t * base) { return base + 3 * TILE_M * TILE_N; }; + + if (M == 2 * TILE_M) { + // i = 0 + const char * B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE); + const char * B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE); + if (need_unpack) { + unpack_B(Tile0, B_blk0); + _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); + } + + _tile_zero(TMM4); + _tile_loadd(TMM2, A[0].qs, lda); + _tile_dpbssd(TMM4, TMM2, TMM0); + _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t)); + + _tile_zero(TMM5); + _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda); + _tile_dpbssd(TMM5, TMM3, TMM0); + _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t)); + + if (need_unpack) { + unpack_B(Tile1, B_blk0); + _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); + } + + _tile_zero(TMM6); + _tile_dpbssd(TMM6, TMM2, TMM1); + _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t)); + + _tile_zero(TMM7); + _tile_dpbssd(TMM7, TMM3, TMM1); + _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t)); + + for (int i = 1; i < KB; ++i) { + // index of previous iter + const int ii = i - 1; + const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE); + const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE); + GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] { + if (need_unpack) { + unpack_B(Tile0, B_blk0); + _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); + } + _tile_zero(TMM4); + _tile_loadd(TMM2, A[i].qs, lda); + acc_C::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); + + _tile_dpbssd(TMM4, TMM2, TMM0); + _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t)); + + _tile_zero(TMM5); + _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda); + acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); + + _tile_dpbssd(TMM5, TMM3, TMM0); + _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t)); + + if (need_unpack) { + unpack_B(Tile1, B_blk1); + _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); + } + _tile_zero(TMM6); + acc_C::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); + + 
_tile_dpbssd(TMM6, TMM2, TMM1); + _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t)); + + _tile_zero(TMM7); + acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); + + _tile_dpbssd(TMM7, TMM3, TMM1); + _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t)); + + std::swap(C_cur, C_pre); + }); + } + // final accumulation + { + int ii = KB - 1; + acc_C::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); + acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); + acc_C::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); + acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); + } + } else { + for (int i = 0; i < KB; ++i) { + _tile_zero(TMM4); + _tile_zero(TMM6); + if (m1 != 0) { + _tile_zero(TMM5); + _tile_zero(TMM7); + } + + const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE); + const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE); + if (need_unpack) { + unpack_B(Tile0, B_blk0); + _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); + } + + if (need_unpack) { + unpack_B(Tile1, B_blk1); + _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); + } else { + _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); + } + + if (m0 == TILE_M) { + _tile_loadd(TMM2, A[i].qs, lda); + } else { + unpack_A(Tile23, &A[i], KB, m0); + _tile_loadd(TMM2, Tile23, TILE_K); + } + + _tile_dpbssd(TMM4, TMM2, TMM0); + _tile_dpbssd(TMM6, TMM2, TMM1); + + _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t)); + _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t)); + + GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { + acc_C::apply(C, ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0); + acc_C::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0); + }); + + if (m1 != 0) { + unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1); + _tile_loadd(TMM3, Tile23, TILE_K); + + _tile_dpbssd(TMM5, TMM3, TMM0); + _tile_dpbssd(TMM7, TMM3, TMM1); + _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t)); + _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t)); + GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { + acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1); + acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1); + }); + } + } + } + return; +} + +template ::value, int>::type = 0> +void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { + static_assert(std::is_same::value); + const int TILE_SIZE = get_tile_size(); + + GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N); + const TA * RESTRICT A = static_cast(_A); + const char * RESTRICT B = static_cast(_B); + + const int m0 = std::min(M, TILE_M); + const int m1 = std::max(M - TILE_M, 0); + //const int lda = KB * sizeof(TA); + + static thread_local int8_t Tile0[TILE_N * TILE_K]; + static thread_local int8_t Tile1[TILE_N * TILE_K]; + static thread_local int8_t Tile23[TILE_M * TILE_K]; + + // mat mul result for each group + static thread_local int32_t Tile4[TILE_M * TILE_N]; + static thread_local int32_t Tile5[TILE_M * TILE_N]; + static thread_local int32_t 
Tile6[TILE_M * TILE_N]; + static thread_local int32_t Tile7[TILE_M * TILE_N]; + + // sum of each QK_K block, contains 8 groups, int32 + static thread_local int32_t Sumi4[TILE_M * TILE_N]; + static thread_local int32_t Sumi5[TILE_M * TILE_N]; + static thread_local int32_t Sumi6[TILE_M * TILE_N]; + static thread_local int32_t Sumi7[TILE_M * TILE_N]; + + const int k_group_size = std::is_same::value ? 16 : 32; + for (int i = 0; i < KB; ++i) { + // step 1: accumulate the quants across 8 groups, each group with 32 + for (int k = 0; k < QK_K / k_group_size; ++k) { + GGML_DISPATCH_BOOL(k > 0, is_acc, [&] { + _tile_zero(TMM4); + _tile_zero(TMM6); + + unpack_B(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k); + _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); + + unpack_B(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k); + _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); + + unpack_A(Tile23, &A[i], KB, k, m0); + _tile_loadd(TMM2, Tile23, TILE_K); + + _tile_dpbssd(TMM4, TMM2, TMM0); + _tile_dpbssd(TMM6, TMM2, TMM1); + + _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t)); + _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t)); + + scale_C(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0); + scale_C(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0); + + if (m1 != 0) { + _tile_zero(TMM5); + _tile_zero(TMM7); + + unpack_A(Tile23, &A[TILE_M * KB + i], KB, k, m1); + _tile_loadd(TMM3, Tile23, TILE_K); + + _tile_dpbssd(TMM5, TMM3, TMM0); + _tile_dpbssd(TMM7, TMM3, TMM1); + + _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t)); + _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t)); + + scale_C(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1); + scale_C(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1); + } + }); + } + + // step 2: accmulate the mins + GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { + acc_C::apply(C, ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0); + acc_C::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0); + if (m1 != 0) { + acc_C::apply(C + TILE_M * ldc, ldc, Sumi5, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1); + acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1); + } + }); + } + return; +} + +} // anonymous namespace + +// get the packed tensor size for quantized weights +size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) { + const enum ggml_type TYPE = tensor->type; + + const int K = tensor->ne[0]; // ne0: in_features + const int N = tensor->ne[1]; // ne1: out_features + + auto get_tensor_size = [&] { + size_t row_size_B{0}; + GGML_DISPATCH_QTYPES(TYPE, [&] { + row_size_B = get_row_size(K); + }); + return N * row_size_B; + }; + + if (qtype_has_amx_kernels(TYPE)) { + return get_tensor_size(); + } else { + // for f16, bf16 we don't do packing + return ggml_nbytes(tensor); + } +} + +// pack weight to vnni format +void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + + size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor); + GGML_ASSERT(alloc_size == size); + + const enum ggml_type TYPE = tensor->type; + + const int K = tensor->ne[0]; // ne0: in_features + const int N = tensor->ne[1]; // ne1: out_features + +#if defined(_OPENMP) + // the buffer ctx is not initialized when .set_tensor is called + int n_threads = omp_get_num_threads(); +#else + int n_threads = 1; +#endif + + GGML_DISPATCH_QTYPES(TYPE, [&] { + convert_B_packed_format((void 
*)((char *)tensor->data + offset), (const type *)data, N, K, n_threads); + }); +} + +// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX) +// +// src0: weight in shape of {N, K}, quantized +// src1: input in shape of {M, K}, float32 +// dst: output in shape of {M, N}, float32 +// +// the function performs: dst = src1 @ src0.T +// +void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src[0]; + struct ggml_tensor * src1 = dst->src[1]; + + const enum ggml_type TYPE = src0->type; + + const int n_threads = ctx->n_threads; + + // f16 only has avx512 kernels for now, + // amx kernels will be added once 6th gen xeon is released. + const bool is_floating_type = TYPE == GGML_TYPE_F16; + + const int M = dst->ne[1]; + const int N = dst->ne[0]; + const int K = src0->ne[0]; + const int ldc = dst->nb[1] / dst->nb[0]; + + if (is_floating_type) { + constexpr int BLOCK_M = 4; + constexpr int BLOCK_N = 6; + const int MB = div_up(M, BLOCK_M); + const int NB = div_up(N, BLOCK_N); + + parallel_for(n_threads, MB * NB, [&](int begin, int end) { + GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { + for (int i = begin; i < end; ++i) { + int mb = i / NB; + int nb = i % NB; + + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); + + switch (mb_size << 4 | nb_size) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_AVX(1, 2); break; + case 0x14: LAUNCH_TINYGEMM_KERNEL_AVX(1, 4); break; + case 0x16: LAUNCH_TINYGEMM_KERNEL_AVX(1, 6); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_AVX(2, 2); break; + case 0x24: LAUNCH_TINYGEMM_KERNEL_AVX(2, 4); break; + case 0x26: LAUNCH_TINYGEMM_KERNEL_AVX(2, 6); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_AVX(3, 2); break; + case 0x34: LAUNCH_TINYGEMM_KERNEL_AVX(3, 4); break; + case 0x36: LAUNCH_TINYGEMM_KERNEL_AVX(3, 6); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_AVX(4, 2); break; + case 0x44: LAUNCH_TINYGEMM_KERNEL_AVX(4, 4); break; + case 0x46: LAUNCH_TINYGEMM_KERNEL_AVX(4, 6); break; + default: fprintf(stderr, "Unexpected block size!\n"); + } + } + }); + }); + return; + } + + // pointer to work space, used to convert A from float to quantized type + void * wdata = nullptr; + + //TODO: performance improvement: merge quant A + GGML_DISPATCH_QTYPES(TYPE, [&] { + const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); + const size_t desired_wsize = M * row_size_A; + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + wdata = ctx->work_data.get(); + + // Q4_0, Q4_1, Q8_0 handle 1 TILE_K per blck_size + // Q4_K, Q5_K, Q6_K, IQ4_XS handle 8 TILE_K per blck_size + GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); + + const float * A_data = static_cast(src1->data); + for (int m = 0; m < M; ++m) { + from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); + } + }); + + if (M == 1) { + // MB = 1 and handle 8 tiles in each block + constexpr int kTilesN = 4; + constexpr int BLOCK_N = TILE_N * kTilesN; + const int NB = div_up(N, BLOCK_N); + + parallel_for(n_threads, NB, [&](int begin, int end) { + GGML_DISPATCH_QTYPES(TYPE, [&] { + const int KB = K / blck_size; + const int TILE_SIZE = get_tile_size(); + const int row_size_A = KB * sizeof(vec_dot_type); + for (int i = begin; i < end; ++i) { + int nb = i; + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96 + + switch (nb_size) { + //case 160: LAUNCH_TINYGEMM_KERNEL_VNNI(160); break; + case 128: LAUNCH_TINYGEMM_KERNEL_VNNI(128); break; + case 96: LAUNCH_TINYGEMM_KERNEL_VNNI(96); break; + case 64: LAUNCH_TINYGEMM_KERNEL_VNNI(64); break; + case 32: LAUNCH_TINYGEMM_KERNEL_VNNI(32); break; + default: fprintf(stderr, "Unexpected n block size!\n"); + } + } + }); + }); + return; + } + + // handle 4 tiles at a time + constexpr int BLOCK_M = TILE_M * 2; + constexpr int BLOCK_N = TILE_N * 2; + const int MB = div_up(M, BLOCK_M); + const int NB = div_up(N, BLOCK_N); + + parallel_for(n_threads, MB * NB, [&](int begin, int end) { + // init tile config for each thread + ggml_tile_config_init(); + + GGML_DISPATCH_QTYPES(TYPE, [&] { + const int KB = K / blck_size; + const int TILE_SIZE = get_tile_size(); + const int row_size_A = KB * sizeof(vec_dot_type); + + for (int i = begin; i < end; ++i) { + int mb = i / NB; + int nb = i % NB; + + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = BLOCK_N; + + tinygemm_kernel_amx( + mb_size, nb_size, KB, + (const char *)wdata + mb_start * row_size_A, + (const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE), + (float *) dst->data + mb_start * N + nb_start, ldc); + } + }); + }); +} + +#else // if defined(__AMX_INT8__) + +void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { + fprintf(stderr, "GGML is not compiled with AMX support!\n"); + + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +#endif // if defined(__AMX_INT8__) diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-amx/mmq.h new file mode 100644 index 000000000..cf0920620 --- /dev/null +++ b/ggml/src/ggml-amx/mmq.h @@ -0,0 +1,17 @@ +#pragma once +#include "common.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); + +void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + +void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index b0d4141cc..fd3deae00 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -9,145 +9,218 @@ extern "C" { #endif // - // Backend buffer + // Backend buffer type // - // buffer type - typedef void * ggml_backend_buffer_type_context_t; - struct ggml_backend_buffer_type_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + const char * (*get_name) (ggml_backend_buffer_type_t buft); // allocate a buffer of this type - ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); // tensor alignment - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); - // max buffer size that can be allocated - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); - // data size needed to allocate the tensor, including padding - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); - // check if tensor data is in host memory - bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); + size_t (*get_alignment) (ggml_backend_buffer_type_t buft); + // (optional) max buffer size that can be allocated (defaults to SIZE_MAX) + size_t (*get_max_size) (ggml_backend_buffer_type_t buft); + // (optional) data
size needed to allocate the tensor, including padding (defaults to ggml_nbytes) + size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); + // (optional) check if tensor data is in host memory (defaults to false) + bool (*is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { struct ggml_backend_buffer_type_i iface; - ggml_backend_buffer_type_context_t context; + ggml_backend_dev_t device; + void * context; }; - // buffer - typedef void * ggml_backend_buffer_context_t; + // + // Backend buffer + // struct ggml_backend_buffer_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); - void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer); - void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); - void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); - void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + const char * (*get_name) (ggml_backend_buffer_t buffer); + // (optional) free the buffer + void (*free_buffer) (ggml_backend_buffer_t buffer); + // base address of the buffer + void * (*get_base) (ggml_backend_buffer_t buffer); + // (optional) initialize a tensor in the buffer (eg. 
add tensor extras) + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + // tensor data access + void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported) + bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + // clear the entire buffer + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + // (optional) reset any internal state due to tensor initialization, such as tensor extras + void (*reset) (ggml_backend_buffer_t buffer); }; struct ggml_backend_buffer { struct ggml_backend_buffer_i iface; ggml_backend_buffer_type_t buft; - ggml_backend_buffer_context_t context; + void * context; size_t size; enum ggml_backend_buffer_usage usage; }; - GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size); + ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + void * context, + size_t size); // do not use directly, use ggml_backend_tensor_copy instead bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + // multi-buffer // buffer that contains a collection of buffers - GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // - // Backend + // Backend (stream) // - typedef void * ggml_backend_context_t; - struct ggml_backend_i { - const char * (*GGML_CALL get_name)(ggml_backend_t backend); + const char * (*get_name)(ggml_backend_t backend); - void (*GGML_CALL free)(ggml_backend_t backend); + void (*free)(ggml_backend_t backend); + // Will be moved to the device interface // buffer allocation - ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); + ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); // (optional) asynchronous tensor data access - void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t 
offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); // (optional) complete all pending operations - void (*GGML_CALL synchronize)(ggml_backend_t backend); + void (*synchronize)(ggml_backend_t backend); - // compute graph with a plan (not used currently) - // create a new plan for a graph - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // (optional) compute graph with a plan (not used currently) + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology - void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); // compute the graph with the plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend) + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // check if the backend can compute an operation - bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); - - // check if the backend can use tensors allocated in a buffer type - bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); - - // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer - // these should be expensive operations with large batch sizes that may benefit from running on this backend - // even if the weight has to be copied from the CPU temporarily - bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface + // new backends should implement the device interface instead + // These functions are being moved to the device interface + bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op); + bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization - // create a new event that can record events on this backend instance - ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); - void (*GGML_CALL event_free) (ggml_backend_event_t event); - // record an event on the backend instance that created it - void (*GGML_CALL event_record) (ggml_backend_event_t event); - // wait for an event on on a different backend instance - void (*GGML_CALL event_wait) (ggml_backend_t backend, 
ggml_backend_event_t event); - // block until an event is recorded - void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); + // record an event on this stream + void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); + // wait for an event on on a different stream + void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); }; struct ggml_backend { ggml_guid_t guid; - struct ggml_backend_i iface; - ggml_backend_context_t context; + ggml_backend_dev_t device; + void * context; }; struct ggml_backend_event { - ggml_backend_t backend; + struct ggml_backend_device * device; void * context; }; // - // Backend registry + // Backend device // - typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); + // Note: if additional properties are needed, we should add a struct with all of them + // the current functions to obtain the properties can remain, since they are more convenient for often used properties + struct ggml_backend_device_i { + // device name: short identifier for this device, such as "CPU" or "CUDA0" + const char * (*get_name)(ggml_backend_dev_t dev); - GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + // device description: short informative description of the device, could be the model name + const char * (*get_description)(ggml_backend_dev_t dev); + + // device memory in bytes + void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total); + + // device type + enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev); + + // device properties + void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props); + + // backend (stream) initialization + ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params); + + // preferred buffer type + ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev); + + // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device) + ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev); + + // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries) + ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size); + + // check if the backend can compute an operation + bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); + + // check if the backend can use tensors allocated in a buffer type + bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft); + + // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer + // these should be expensive operations that may benefit from running on this backend instead of the CPU backend + bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); + + // (optional) event synchronization + ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); + void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); + void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); + }; + + struct ggml_backend_device { + struct ggml_backend_device_i iface; + ggml_backend_reg_t reg; + void * context; + }; + + // + // Backend (reg) + // + + struct ggml_backend_reg_i { + const char * 
(*get_name)(ggml_backend_reg_t reg); + + // enumerate available devices + size_t (*get_device_count)(ggml_backend_reg_t reg); + ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index); + + // (optional) get a pointer to a function in the backend + // backends can add custom functions that are not part of the standard ggml-backend interface + void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name); + }; + + struct ggml_backend_reg { + // int api_version; // TODO: for dynamic loading + struct ggml_backend_reg_i iface; + void * context; + }; + + + // Internal backend registry API + void ggml_backend_register(ggml_backend_reg_t reg); + void ggml_backend_device_register(ggml_backend_dev_t device); + // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function + // typedef ggml_backend_register_t * (*ggml_backend_init)(void); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.cpp similarity index 73% rename from ggml/src/ggml-backend.c rename to ggml/src/ggml-backend.cpp index ba280e064..7d7b63a15 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.cpp @@ -1,3 +1,13 @@ +// Note: porting this file to C++ is a work in progress + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" @@ -8,9 +18,14 @@ #include #include #include +#include +#include +#ifdef __APPLE__ +#include +#include +#endif -#define MAX(a, b) ((a) > (b) ? (a) : (b)) // backend buffer type @@ -18,7 +33,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } -GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -34,7 +49,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { return SIZE_MAX; } -GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); @@ -51,16 +66,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { return false; } +ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) { + return buft->device; +} + // backend buffer -GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); - - (*buffer) = (struct ggml_backend_buffer) { +ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + void * context, + size_t size) { + ggml_backend_buffer_t buffer = new ggml_backend_buffer { /* .interface = */ iface, /* .buft = */ buft, /* .context = */ context, @@ -83,7 +100,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } - free(buffer); + delete buffer; } size_t 
ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { @@ -98,14 +115,14 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return base; } -GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } } -size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } @@ -218,7 +235,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } } -GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -232,7 +249,7 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * buf->iface.set_tensor(buf, tensor, data, offset, size); } -GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -246,7 +263,7 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * buf->iface.get_tensor(buf, tensor, data, offset, size); } -GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -299,20 +316,38 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_supports_op(backend->device, op); + } + return backend->iface.supports_op(backend, op); } bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_supports_buft(backend->device, buft); + } return backend->iface.supports_buft(backend, buft); } bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_offload_op(backend->device, op); + } + if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); } return false; } +ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { + return backend->device; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -343,7 +378,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); + GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); @@ -375,30 +410,31 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b // events -ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) { - if (backend->iface.event_new == NULL) { +ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) { + // null device is allowed for the transition period to the device interface + if (device == NULL || device->iface.event_new == NULL) { return NULL; } - return backend->iface.event_new(backend); + return device->iface.event_new(device); } void ggml_backend_event_free(ggml_backend_event_t event) { if (event == NULL) { return; } - event->backend->iface.event_free(event); + event->device->iface.event_free(event->device, event); } -void ggml_backend_event_record(ggml_backend_event_t event) { - GGML_ASSERT(event->backend->iface.event_record != NULL); +void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) { + GGML_ASSERT(backend->iface.event_record != NULL); - event->backend->iface.event_record(event); + backend->iface.event_record(backend, event); } void ggml_backend_event_synchronize(ggml_backend_event_t event) { - GGML_ASSERT(event->backend->iface.event_synchronize != NULL); + GGML_ASSERT(event->device->iface.event_synchronize); - event->backend->iface.event_synchronize(event); + event->device->iface.event_synchronize(event->device, event); } void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { @@ -407,170 +443,283 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) backend->iface.event_wait(backend, event); } -// backend 
registry +// Backend device -#define GGML_REG_MAX_BACKENDS 64 - -struct ggml_backend_reg { - char name[128]; - ggml_backend_init_fn init_fn; - ggml_backend_buffer_type_t default_buffer_type; - void * user_data; -}; - -static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS]; -static size_t ggml_backend_registry_count = 0; - -GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); - -GGML_CALL static void ggml_backend_registry_init(void) { - static bool initialized = false; - - if (initialized) { - return; - } - - initialized = true; - - ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); - - // add forward decls here to avoid including the backend headers -#ifdef GGML_USE_CUDA - extern GGML_CALL void ggml_backend_cuda_reg_devices(void); - ggml_backend_cuda_reg_devices(); -#endif - -#ifdef GGML_USE_SYCL - extern void ggml_backend_sycl_reg_devices(void); - ggml_backend_sycl_reg_devices(); -#endif - -#ifdef GGML_USE_METAL - extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); -#endif - -#ifdef GGML_USE_VULKAN - extern GGML_CALL int ggml_backend_vk_reg_devices(void); - ggml_backend_vk_reg_devices(); -#endif - -#ifdef GGML_USE_KOMPUTE - extern GGML_CALL void ggml_backend_kompute_reg_devices(void); - ggml_backend_kompute_reg_devices(); -#endif - -#ifdef GGML_USE_CANN - extern GGML_CALL int ggml_backend_cann_reg_devices(void); - ggml_backend_cann_reg_devices(); -#endif +const char * ggml_backend_dev_name(ggml_backend_dev_t device) { + return device->iface.get_name(device); } -GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { - GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS); - - size_t id = ggml_backend_registry_count; - - ggml_backend_registry[id] = (struct ggml_backend_reg) { - /* .name = */ {0}, - /* .fn = */ init_fn, - /* .default_buffer_type = */ default_buffer_type, - /* .user_data = */ user_data, - }; - - snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name); - -#ifndef NDEBUG - fprintf(stderr, "%s: registered backend %s\n", __func__, name); -#endif - - ggml_backend_registry_count++; +const char * ggml_backend_dev_description(ggml_backend_dev_t device) { + return device->iface.get_description(device); } -size_t ggml_backend_reg_get_count(void) { - ggml_backend_registry_init(); - - return ggml_backend_registry_count; +void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + device->iface.get_memory(device, free, total); } -size_t ggml_backend_reg_find_by_name(const char * name) { - ggml_backend_registry_init(); - - for (size_t i = 0; i < ggml_backend_registry_count; i++) { - // TODO: case insensitive in a portable way - if (strcmp(ggml_backend_registry[i].name, name) == 0) { - return i; - } - } - - // not found - return SIZE_MAX; +enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { + return device->iface.get_type(device); } -// init from backend:params string -ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) { - ggml_backend_registry_init(); +void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct 
ggml_backend_dev_props * props) { + memset(props, 0, sizeof(*props)); + device->iface.get_props(device, props); +} - const char * params = strchr(backend_str, ':'); - char backend_name[128]; - if (params == NULL) { - snprintf(backend_name, sizeof(backend_name), "%s", backend_str); - params = ""; - } else { - snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str); - params++; - } +ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { + return device->reg; +} - size_t backend_i = ggml_backend_reg_find_by_name(backend_name); +ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) { + return device->iface.init_backend(device, params); +} - if (backend_i == SIZE_MAX) { - fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name); +ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { + return device->iface.get_buffer_type(device); +} + +ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) { + if (device->iface.get_host_buffer_type == NULL) { return NULL; } - return ggml_backend_reg_init_backend(backend_i, params); + return device->iface.get_host_buffer_type(device); } -const char * ggml_backend_reg_get_name(size_t i) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].name; +ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { + return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size); } -ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data); +bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + return device->iface.supports_op(device, op); } -ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].default_buffer_type; +bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { + return device->iface.supports_buft(device, buft); } -ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { - ggml_backend_registry_init(); +bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + if (device->iface.offload_op != NULL) { + return device->iface.offload_op(device, op); + } - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size); + return false; +} + +// Backend (reg) + +const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { + return reg->iface.get_name(reg); +} + +size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) { + return reg->iface.get_device_count(reg); +} + +ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) { + return reg->iface.get_device(reg, index); +} + +void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (!reg->iface.get_proc_address) { + return NULL; + } + return reg->iface.get_proc_address(reg, name); +} + +// Backend registry + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include 
"ggml-metal.h" +#endif + +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + +#ifdef GGML_USE_BLAS +#include "ggml-blas.h" +#endif + +#ifdef GGML_USE_RPC +#include "ggml-rpc.h" +#endif + +#ifndef __AMX_INT8__ +#undef GGML_USE_AMX +#endif + +#ifdef GGML_USE_AMX +# include "ggml-amx.h" +#endif + +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + +struct ggml_backend_registry { + std::vector backends; + std::vector devices; + + ggml_backend_registry() { +#ifdef GGML_USE_CUDA + register_backend(ggml_backend_cuda_reg()); +#endif +#ifdef GGML_USE_METAL + register_backend(ggml_backend_metal_reg()); +#endif +#ifdef GGML_USE_SYCL + register_backend(ggml_backend_sycl_reg()); +#endif +#ifdef GGML_USE_VULKAN + register_backend(ggml_backend_vk_reg()); +#endif +#ifdef GGML_USE_BLAS + register_backend(ggml_backend_blas_reg()); +#endif +#ifdef GGML_USE_RPC + register_backend(ggml_backend_rpc_reg()); +#endif +#ifdef GGML_USE_AMX + register_backend(ggml_backend_amx_reg()); +#endif +#ifdef GGML_USE_CANN + register_backend(ggml_backend_cann_reg()); +#endif + + // TODO: kompute + + register_backend(ggml_backend_cpu_reg()); + } + + void register_backend(ggml_backend_reg_t reg) { +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", + __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); +#endif + backends.push_back(reg); + for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { + register_device(ggml_backend_reg_dev_get(reg, i)); + } + } + + void register_device(ggml_backend_dev_t device) { +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); +#endif + devices.push_back(device); + } +}; + +static ggml_backend_registry & get_reg() { + static ggml_backend_registry reg; + return reg; +} + +// Internal API +void ggml_backend_register(ggml_backend_reg_t reg) { + get_reg().register_backend(reg); +} + +void ggml_backend_device_register(ggml_backend_dev_t device) { + get_reg().register_device(device); +} + +// Backend (reg) enumeration +size_t ggml_backend_reg_count() { + return get_reg().backends.size(); +} + +ggml_backend_reg_t ggml_backend_reg_get(size_t index) { + GGML_ASSERT(index < ggml_backend_reg_count()); + return get_reg().backends[index]; +} + +ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + ggml_backend_reg_t reg = ggml_backend_reg_get(i); + if (strcmp(ggml_backend_reg_name(reg), name) == 0) { + return reg; + } + } + return NULL; +} + +// Device enumeration +size_t ggml_backend_dev_count() { + return get_reg().devices.size(); +} + +ggml_backend_dev_t ggml_backend_dev_get(size_t index) { + GGML_ASSERT(index < ggml_backend_dev_count()); + return get_reg().devices[index]; +} + +ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (strcmp(ggml_backend_dev_name(dev), name) == 0) { + return dev; + } + } + return NULL; +} + +ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == type) { + return dev; + } + } + return NULL; +} + +// Convenience functions +ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { + 
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); + if (!dev) { + return NULL; + } + return ggml_backend_dev_init(dev, params); +} + +ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); + if (!dev) { + return NULL; + } + return ggml_backend_dev_init(dev, params); +} + +ggml_backend_t ggml_backend_init_best(void) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL); + if (!dev) { + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL); + } + if (!dev) { + return NULL; + } + return ggml_backend_dev_init(dev, NULL); } // backend CPU -static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment - -GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { return "CPU"; GGML_UNUSED(buffer); } -GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; // align the buffer @@ -581,29 +730,29 @@ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t b return (void *)data; } -GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, buffer->size); } -GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } -GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -613,12 +762,12 @@ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t b GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { 
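The registry now exposes plain enumeration plus convenience initializers. A sketch of how a caller might list the registered devices and pick a backend (illustrative only, error handling trimmed):

    #include <stdio.h>
    #include "ggml-backend.h"

    static ggml_backend_t pick_backend(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            size_t free_mem  = 0;
            size_t total_mem = 0;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf("device %zu: %s (%s), %zu/%zu MB free\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                free_mem/1024/1024, total_mem/1024/1024);
        }
        // try a specific device first, then fall back to the best available one
        ggml_backend_t backend = ggml_backend_init_by_name("CPU", /*params =*/ NULL);
        if (backend == NULL) {
            backend = ggml_backend_init_best(); // GPU_FULL first, then CPU_FULL
        }
        return backend;
    }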
memset(buffer->context, value, buffer->size); } -static struct ggml_backend_buffer_i cpu_backend_buffer_i = { - /* .get_name = */ ggml_backend_cpu_buffer_name, +static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -630,9 +779,8 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .reset = */ NULL, }; -// for buffers from ptr, free is not called -static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { - /* .get_name = */ ggml_backend_cpu_buffer_name, +static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { + /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -644,38 +792,43 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .reset = */ NULL, }; -GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; GGML_UNUSED(buft); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) +static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto alloc_size = size; + if (alloc_size == 0) { + alloc_size = 1; + } + + void * data = ggml_aligned_malloc(alloc_size); + if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size); return NULL; } - return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size); } -GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { +ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { + /* .iface = */ { /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, @@ -683,6 +836,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, }; @@ -695,28 +849,28 @@ GGML_CALL 
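Buffer types now record the device that owns them, and the CPU allocation path goes through ggml_aligned_malloc instead of over-allocating by TENSOR_ALIGNMENT. A short sketch of allocating from the CPU buffer type and checking the advertised alignment (assumed host-side helper):

    #include <assert.h>
    #include <stdint.h>
    #include "ggml-backend.h"

    static ggml_backend_buffer_t alloc_cpu_scratch(size_t size) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
        if (buf != NULL) {
            // the base pointer honors the alignment reported by the buffer type
            assert((uintptr_t) ggml_backend_buffer_get_base(buf) % ggml_backend_buffer_get_alignment(buf) == 0);
        }
        return buf; // released later with ggml_backend_buffer_free()
    }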
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include -GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU_HBM"; GGML_UNUSED(buft); } -GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { return "CPU_HBM"; GGML_UNUSED(buf); } -GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); if (result != 0) { - fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); return NULL; } @@ -749,27 +903,27 @@ struct ggml_backend_cpu_context { int n_threads; ggml_threadpool_t threadpool; - void * work_data; + uint8_t * work_data; size_t work_size; ggml_abort_callback abort_callback; void * abort_callback_data; }; -GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { +static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) { +static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - free(cpu_ctx->work_data); - free(cpu_ctx); - free(backend); + delete[] cpu_ctx->work_data; + delete cpu_ctx; + delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); @@ -780,18 +934,18 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; if (cpu_plan->cplan.work_data == NULL) { - free(cpu_plan); + delete cpu_plan; return NULL; } } @@ -802,16 +956,16 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg return cpu_plan; } -GGML_CALL static void 
ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - free(cpu_plan->cplan.work_data); - free(cpu_plan); + delete[] cpu_plan->cplan.work_data; + delete cpu_plan; GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -819,21 +973,21 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { - free(cpu_ctx->work_data); - cpu_ctx->work_data = malloc(cplan.work_size); + delete[] cpu_ctx->work_data; + cpu_ctx->work_data = new uint8_t[cplan.work_size]; if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t *)cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -841,35 +995,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t return ggml_graph_compute(cgraph, &cplan); } -GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - switch (op->op) { - case GGML_OP_CPY: - return - op->type != GGML_TYPE_IQ2_XXS && - op->type != GGML_TYPE_IQ2_XS && - op->type != GGML_TYPE_IQ1_S && - op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float - case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; - case GGML_OP_ROPE_BACK: - return op->src[2] == NULL && (op->op_params[2] & 4) == 0; - case GGML_OP_IM2COL_BACK: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; - default: - return true; - } - - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i cpu_backend_i = { - /* .get_name = */ ggml_backend_cpu_name, +static const struct ggml_backend_i ggml_backend_cpu_i = { + /* .get_name = */ ggml_backend_cpu_get_name, /* .free = */ ggml_backend_cpu_free, /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, @@ -881,14 +1008,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, - /* .supports_op = */ 
ggml_backend_cpu_supports_op, - /* .supports_buft = */ ggml_backend_cpu_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { @@ -897,7 +1021,7 @@ static ggml_guid_t ggml_backend_cpu_guid(void) { } ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; if (ctx == NULL) { return NULL; } @@ -909,21 +1033,22 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = new ggml_backend { + /* .guid = */ ggml_backend_cpu_guid(), + /* .interface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, + }; + if (cpu_backend == NULL) { - free(ctx); + delete ctx; return NULL; } - *cpu_backend = (struct ggml_backend) { - /* .guid = */ ggml_backend_cpu_guid(), - /* .interface = */ cpu_backend_i, - /* .context = */ ctx - }; return cpu_backend; } -GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) { +bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); } @@ -954,16 +1079,233 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_ ctx->abort_callback_data = abort_callback_data; } -GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); + return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } -GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { +//////////////////////// + +struct ggml_backend_cpu_device_context { + std::string description = "CPU"; + + ggml_backend_cpu_device_context() { +#ifdef __APPLE__ + size_t len = 0; + if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) { + description.resize(len); + sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT + } +#elif defined(__linux__) + FILE * f = fopen("/proc/cpuinfo", "r"); + if (f) { + char buf[1024]; + while (fgets(buf, sizeof(buf), f)) { + if (strncmp(buf, "model name", 10) == 0) { + char * p = strchr(buf, ':'); + if (p) { + p++; + while (std::isspace(*p)) { + p++; + } + while (std::isspace(p[strlen(p) - 1])) { + p[strlen(p) - 1] = '\0'; + } + description = p; + break; + } + } + } + fclose(f); + } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) == ERROR_SUCCESS) { + DWORD cpu_brand_size = 0; + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + NULL, + &cpu_brand_size) == ERROR_SUCCESS) { + description.resize(cpu_brand_size); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)&description[0], // NOLINT + 
&cpu_brand_size) == ERROR_SUCCESS) { + if (description.find('\0') != std::string::npos) { + description.resize(description.find('\0')); + } + } + } + RegCloseKey(hKey); + } +#endif + } +}; + +static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { + return "CPU"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { + struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; + + return ctx->description.c_str(); +} + +static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; + + GGML_UNUSED(dev); +} + +static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_cpu_device_get_name(dev); + props->description = ggml_backend_cpu_device_get_description(dev); + props->type = ggml_backend_cpu_device_get_type(dev); + ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_cpu_init(); + GGML_UNUSED(dev); GGML_UNUSED(params); - GGML_UNUSED(user_data); +} + +static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_CPY: + return + op->type != GGML_TYPE_IQ2_XXS && + op->type != GGML_TYPE_IQ2_XS && + op->type != GGML_TYPE_IQ1_S && + op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float + case GGML_OP_MUL_MAT: + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; + case GGML_OP_ROPE_BACK: + return op->src[2] == NULL && (op->op_params[2] & 4) == 0; + case GGML_OP_IM2COL_BACK: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD: + return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; + default: + return true; + } + + GGML_UNUSED(dev); +} + +static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { + /* .get_name = */ ggml_backend_cpu_device_get_name, + /* .get_description = */ ggml_backend_cpu_device_get_description, + /* .get_memory = */ ggml_backend_cpu_device_get_memory, + /* .get_type = */ ggml_backend_cpu_device_get_type, + /* .get_props = */ ggml_backend_cpu_device_get_props, + /* .init_backend = */ ggml_backend_cpu_device_init, + /* .get_buffer_type = */ 
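ggml_backend_dev_get_props zero-initializes the struct and then lets the device fill it, so name, description, memory and the caps flags come back in one call. A minimal sketch, assuming `dev` was obtained from the registry and `ptr` is a suitably aligned host allocation:

    #include "ggml-backend.h"

    static ggml_backend_buffer_t wrap_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        if (!props.caps.buffer_from_host_ptr) {
            return NULL; // this device cannot wrap an existing host allocation
        }
        // max_tensor_size == size: a single tensor may use the whole region;
        // note that the CPU implementation asserts the pointer is aligned
        return ggml_backend_dev_buffer_from_host_ptr(dev, ptr, size, size);
    }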
ggml_backend_cpu_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_cpu_device_supports_op, + /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +//////////////////////// + +static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { + return "CPU"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_cpu_device_context ctx; + static ggml_backend_device ggml_backend_cpu_device = { + /* .iface = */ ggml_backend_cpu_device_i, + /* .reg = */ reg, + /* .context = */ &ctx, + }; + + return &ggml_backend_cpu_device; +} + +static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_cpu_set_n_threads; + } + return NULL; + + GGML_UNUSED(reg); +} + +static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { + /* .get_name = */ ggml_backend_cpu_reg_get_name, + /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, + /* .get_device = */ ggml_backend_cpu_reg_get_device, + /* .get_proc_address = */ ggml_backend_cpu_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_cpu_reg(void) { + static struct ggml_backend_reg ggml_backend_cpu_reg = { + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_reg; } // multi-buffer buffer @@ -973,16 +1315,14 @@ struct ggml_backend_multi_buffer_context { size_t n_buffers; }; -typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t; - -GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; return ctx->buffers[0]->iface.get_name(ctx->buffers[0]); } -GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_free(ctx->buffers[i]); } @@ -991,32 +1331,28 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_ free(ctx); } -GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_clear(ctx->buffers[i], value); } } -static struct 
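get_proc_address lets callers reach optional, backend-specific entry points by name without including backend headers; the CPU registry answers "ggml_backend_set_n_threads" here, and the BLAS registry further down does the same. A sketch with a locally defined function-pointer type that mirrors the *_set_n_threads setters:

    #include "ggml-backend.h"

    // local typedef for the example; matches the signature of ggml_backend_cpu_set_n_threads
    typedef void (*set_n_threads_fn_t)(ggml_backend_t backend, int n_threads);

    static void try_set_n_threads(ggml_backend_t backend, int n_threads) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        if (dev == NULL) {
            return; // legacy backend without a device
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        set_n_threads_fn_t fn = (set_n_threads_fn_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (fn != NULL) {
            fn(backend, n_threads);
        }
    }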
ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) { - static struct ggml_backend_buffer_i multi_backend_buffer_i = { - /* .get_name = */ ggml_backend_multi_buffer_get_name, - /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, - /* .get_base = */ NULL, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ NULL, - /* .get_tensor = */ NULL, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_multi_buffer_clear, - /* .reset = */ NULL, - }; +static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { + /* .get_name = */ ggml_backend_multi_buffer_get_name, + /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, + /* .get_base = */ NULL, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ NULL, + /* .get_tensor = */ NULL, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_multi_buffer_clear, + /* .reset = */ NULL, +}; - return multi_backend_buffer_i; -} - -GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context)); +ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context)); ctx->n_buffers = n_buffers; ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); @@ -1028,16 +1364,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back total_size += ggml_backend_buffer_get_size(buffers[i]); } - return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size); + return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size); } -GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_multi_buffer_get_name; } -GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { +void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer)); - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_set_usage(ctx->buffers[i], usage); } @@ -1157,7 +1493,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co } #ifndef NDEBUG - fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", + GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); #endif @@ -1246,13 +1582,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { 
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, + GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); cur_split++; } struct ggml_tensor * node = graph->nodes[i]; @@ -1260,7 +1596,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1268,10 +1604,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } } @@ -1592,7 +1928,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg i_split++; if (i_split >= sched->splits_capacity) { sched->splits_capacity *= 2; - sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + sched->splits = (ggml_backend_sched_split *) + realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } split = &sched->splits[i_split]; @@ -1678,11 +2015,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; + int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; if (sched->graph.size < graph_size) { sched->graph.size = graph_size; - sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); - sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); GGML_ASSERT(sched->graph.nodes != NULL); GGML_ASSERT(sched->graph.leafs != NULL); } @@ -1784,11 +2121,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving 
(backend_ids_changed = %d)\n", __func__, backend_ids_changed); + GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { - fprintf(stderr, "%s: failed to allocate graph\n", __func__); + GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); return false; } } @@ -1881,7 +2218,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // record the event of this copy if (split->n_inputs > 0) { if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]); + ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend); } } } @@ -1901,7 +2238,7 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU - struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; sched->n_backends = n_backends; @@ -1910,30 +2247,31 @@ ggml_backend_sched_t ggml_backend_sched_new( // initialize hash table // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) sched->hash_set = ggml_hash_set_new(graph_size); - sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); - sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); + sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); - sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); - sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); + sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); - sched->context_buffer = malloc(sched->context_buffer_size); + sched->context_buffer = (char *) malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; - sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); + sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, 
sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); + if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { - sched->events[b][c] = ggml_backend_event_new(backends[b]); + sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } } @@ -2169,8 +2507,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); - struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT - bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); + struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT + bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), @@ -2182,13 +2520,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_unallocated = ggml_init(params); if (ctx_allocated == NULL || ctx_unallocated == NULL) { - fprintf(stderr, "failed to allocate context for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2205,13 +2543,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { - fprintf(stderr, "failed to allocate buffer for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2240,7 +2578,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_copies); free(node_init); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ buffer, /* .ctx_allocated = */ ctx_allocated, /* .ctx_unallocated = */ ctx_unallocated, diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 6d99c6bea..7875ec86d 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -4,6 +4,7 @@ #include #include +#include #if defined(GGML_USE_ACCELERATE) # include @@ -26,30 +27,6 @@ struct ggml_backend_blas_context { #endif }; -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // 
TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} - static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -88,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - ggml_to_float_t const to_float = type_traits.to_float; + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -235,25 +212,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g // backend interface -GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { +static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { return "BLAS"; GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { +static void ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; delete ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -285,29 +262,8 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || - (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); - - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(backend); -} - static struct ggml_backend_i blas_backend_i = { - /* .get_name = */ ggml_backend_blas_name, + /* .get_name = */ ggml_backend_blas_get_name, /* .free = */ ggml_backend_blas_free, /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, /* .set_tensor_async = */ NULL, @@ -319,14 +275,11 @@ static struct ggml_backend_i blas_backend_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_blas_graph_compute, - /* .supports_op = */ 
ggml_backend_blas_supports_op, - /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_blas_guid(void) { @@ -340,23 +293,24 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), /* .interface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), /* .context = */ ctx, }; -#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) +#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) if (openblas_get_parallel() != OPENBLAS_OPENMP) { - fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); } #endif -#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif return backend; } -GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { +bool ggml_backend_is_blas(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); } @@ -366,3 +320,205 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; ctx->n_threads = n_threads; } + +// device interface + +static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) { + return "BLAS"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) { + #if defined(GGML_USE_ACCELERATE) + return "Accelerate"; + #elif defined(GGML_BLAS_USE_MKL) + return "MKL"; + #elif defined(GGML_BLAS_USE_BLIS) + return "BLIS"; + #elif defined(GGML_BLAS_USE_NVPL) + return "NVPL"; + #elif defined(OPENBLAS_VERSION) + return "OpenBLAS"; + #else + return "BLAS"; + #endif + + GGML_UNUSED(dev); +} + +static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_CPU; + + GGML_UNUSED(dev); +} + +static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_blas_device_get_name(dev); + props->description = ggml_backend_blas_device_get_description(dev); + props->type = ggml_backend_blas_device_get_type(dev); + ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) { + return ggml_backend_blas_init(); + + GGML_UNUSED(dev); + 
GGML_UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + + case GGML_OP_MUL_MAT: + { + // BLAS usually is only faster for large matrices + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = op->ne[0]; + const int64_t ne1 = op->ne[1]; + + // TODO: find the optimal value + const int64_t min_batch = 32; + + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + } + + case GGML_OP_OUT_PROD: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + + default: + return false; + + } + + GGML_UNUSED(dev); +} + +static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_blas_device_i = { + /* .get_name = */ ggml_backend_blas_device_get_name, + /* .get_description = */ ggml_backend_blas_device_get_description, + /* .get_memory = */ ggml_backend_blas_device_get_memory, + /* .get_type = */ ggml_backend_blas_device_get_type, + /* .get_props = */ ggml_backend_blas_device_get_props, + /* .init_backend = */ ggml_backend_blas_device_init, + /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_blas_device_supports_op, + /* .supports_buft = */ ggml_backend_blas_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend reg interface + +static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) { + return "BLAS"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device ggml_backend_blas_device = { + /* .iface = */ ggml_backend_blas_device_i, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &ggml_backend_blas_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static void * 
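The old ggml_backend_blas_use_blas() heuristic now lives in the device's supports_op, so the same gate can be queried up front. A sketch of probing it before scheduling; `ctx`, `a` and `b` are illustrative names for an existing context and two tensors:

    #include <stdbool.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    static bool blas_would_take_mul_mat(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
        ggml_backend_dev_t dev = ggml_backend_dev_by_name("BLAS");
        if (dev == NULL) {
            return false; // BLAS backend not built in / not registered
        }
        struct ggml_tensor * mm = ggml_mul_mat(ctx, a, b);
        // true only when the operands are contiguous, src1 is F32, src0 is F32 or
        // convertible to float, and ne0/ne1/ne10 all reach the min_batch threshold (32)
        return ggml_backend_dev_supports_op(dev, mm);
    }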
ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_blas_set_n_threads; + } + return NULL; + + GGML_UNUSED(reg); + GGML_UNUSED(name); +} + +static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { + /* .get_name = */ ggml_backend_blas_reg_get_name, + /* .get_device_count = */ ggml_backend_blas_reg_get_device_count, + /* .get_device = */ ggml_backend_blas_reg_get_device, + /* .get_proc_address = */ ggml_backend_blas_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_blas_reg(void) { + static struct ggml_backend_reg ggml_backend_blas_reg = { + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_blas_reg; +} diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index d3ab78006..af0fb603a 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -39,68 +39,7 @@ #include "ggml-common.h" -/** - * @brief Default logging callback for GGML. - * - * This function is the default logging callback that logs messages to stderr. - * - * @param level The log level. - * @param msg The log message. - * @param user_data User data passed to the callback. - */ -static void ggml_cann_default_log_callback(enum ggml_log_level level, - const char* msg, void* user_data) { - GGML_UNUSED(level); - GGML_UNUSED(user_data); - fprintf(stderr, "%s", msg); -} - -ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback; -void* ggml_cann_log_user_data = NULL; - -GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, - void* user_data) { - ggml_cann_log_callback = log_callback; - ggml_cann_log_user_data = user_data; -} - -#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_CANN_LOG_ERROR(...) \ - ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - -GGML_ATTRIBUTE_FORMAT(2, 3) - -/** - * @brief Log a message using the current logging callback. - * - * This function formats a log message and passes it to the current logging - * callback. - * - * @param level The log level. - * @param format The format string for the log message. - * @param ... The arguments for the format string. - */ -static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { - if (ggml_cann_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data); - } else { - // vsnprintf adds a null terminator - std::vector buffer2(len + 1); - va_end(args); - va_start(args, format); - vsnprintf(&buffer2[0], buffer2.size(), format, args); - ggml_cann_log_callback(level, buffer2.data(), - ggml_cann_log_user_data); - } - va_end(args); - } -} +#define GGML_CANN_NAME "CANN" /** * @brief Handles CANN errors by printing an error message and aborting. @@ -116,10 +55,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) 
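Picking up the BLAS registry completed just above: ggml_backend_blas_set_n_threads is now reached through the get_proc_address hook rather than a dedicated interface slot, roughly as sketched below (ggml_backend_reg_get_proc_address is assumed to be the matching wrapper in ggml-backend.h).

    // illustrative only
    typedef void (*set_n_threads_fn_t)(ggml_backend_t backend, int n_threads);

    static void example_set_blas_threads(ggml_backend_t backend, int n_threads) {
        ggml_backend_reg_t reg = ggml_backend_blas_reg();
        set_n_threads_fn_t fn = (set_n_threads_fn_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (fn != NULL) {
            fn(backend, n_threads);  // forwards to ggml_backend_blas_set_n_threads()
        }
    }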
{ int32_t id = -1; aclrtGetDevice(&id); - GGML_CANN_LOG_ERROR("CANN error: %s\n", msg); - GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, + GGML_LOG_ERROR("CANN error: %s\n", msg); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); - GGML_CANN_LOG_ERROR(" %s\n", stmt); + GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace GGML_ABORT("CANN error"); } @@ -165,7 +104,7 @@ static ggml_cann_device_info ggml_cann_init() { aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n", + GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg()); return info; } @@ -315,7 +254,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO( + GGML_LOG_INFO( "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " "requested %u MB\n", __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), @@ -470,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // add to the pool pool_size += reserve_size; - // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB ( + // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( // reserved %llu MB)\n", // device, (unsigned long long) (pool_size/1024/1024), // (unsigned long long) (reserve_size/1024/1024)); @@ -483,7 +422,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { pool_used += size; #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, + GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long)size, (unsigned long long)ptr); #endif return ptr; @@ -497,7 +436,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ void free(void* ptr, size_t size) override { #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, + GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long)size, (unsigned long long)ptr); #endif @@ -560,7 +499,7 @@ struct ggml_backend_cann_buffer_context { * @return A pointer to a C-string containing the name of the buffer. */ -GGML_CALL static const char* ggml_backend_cann_buffer_get_name( +static const char* ggml_backend_cann_buffer_get_name( ggml_backend_buffer_t buffer) { return "CANN"; @@ -576,7 +515,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name( * @param buffer The buffer to check. * @return true if the buffer is a CANN buffer, false otherwise. */ -GGML_CALL static bool ggml_backend_buffer_is_cann( +static bool ggml_backend_buffer_is_cann( ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; } @@ -589,7 +528,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann( * * @param buffer The CANN buffer to free. */ -GGML_CALL static void ggml_backend_cann_buffer_free_buffer( +static void ggml_backend_cann_buffer_free_buffer( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -605,7 +544,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer( * @param buffer The CANN buffer whose base pointer is to be retrieved. * @return A pointer to the base of the device memory allocated for the buffer. 
*/ -GGML_CALL static void* ggml_backend_cann_buffer_get_base( +static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -625,9 +564,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, - const void* src, - void* dst) { +static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK4_0; @@ -677,7 +616,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q4.0 formatted data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back_q4_0( +static void ggml_backend_cann_transform_back_q4_0( const ggml_tensor* tensor, void* src, void* dst) { int64_t n_elems = ggml_nelements(tensor); @@ -726,9 +665,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, - const void* src, - void* dst) { +static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK8_0; size_t quant_bytes = n_elems * sizeof(uint8_t); @@ -760,7 +699,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q8.0 formatted data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back_q8_0( +static void ggml_backend_cann_transform_back_q8_0( const ggml_tensor* tensor, const void* src, void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK8_0; @@ -792,8 +731,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, - const void* src, void* dst) { +static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: ggml_backend_cann_transform_q4_0(tensor, src, dst); @@ -818,7 +757,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where transformed tensor data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back( +static void ggml_backend_cann_transform_back( const ggml_tensor* tensor, void* src, void* dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: @@ -841,7 +780,7 @@ GGML_CALL static void ggml_backend_cann_transform_back( * @param type The tensor type to check. * @return true if transformation is needed, false otherwise. */ -GGML_CALL static bool need_transform(ggml_type type) { +static bool need_transform(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: @@ -860,7 +799,7 @@ GGML_CALL static bool need_transform(ggml_type type) { * @param buffer The CANN buffer from which to initialize the tensor. * @param tensor Pointer to the tensor to be initialized. 
*/ -GGML_CALL static void ggml_backend_cann_buffer_init_tensor( +static void ggml_backend_cann_buffer_init_tensor( ggml_backend_buffer_t buffer, ggml_tensor* tensor) { if (tensor->view_src != NULL && tensor->view_offs == 0) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); @@ -896,7 +835,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor( * @param offset Offset in the source data from where to start copying. * @param size Size of the data to be copied, in bytes. */ -GGML_CALL static void ggml_backend_cann_buffer_set_tensor( +static void ggml_backend_cann_buffer_set_tensor( ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { ggml_backend_cann_buffer_context *ctx = @@ -914,13 +853,6 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( void *transform_buffer = malloc(size); ggml_backend_cann_transform(tensor, data, transform_buffer); -#ifndef NDEBUG - void *check_buffer = malloc(size); - ggml_backend_cann_transform_back(tensor, transform_buffer, - check_buffer); - GGML_ASSERT(memcmp(data, check_buffer, size) == 0); - free(check_buffer); -#endif ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE)); @@ -941,7 +873,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( * @param offset Offset in the destination buffer where to start copying. * @param size Size of the data to be copied, in bytes. */ -GGML_CALL static void ggml_backend_cann_buffer_get_tensor( +static void ggml_backend_cann_buffer_get_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { ggml_backend_cann_buffer_context* ctx = @@ -975,7 +907,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor( * @param dst Pointer to the destination tensor where the data will be copied. * @return true if the copy operation succeeded, false otherwise. */ -GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( +static bool ggml_backend_cann_buffer_cpy_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { if (ggml_backend_buffer_is_cann(src->buffer)) { ggml_backend_cann_buffer_context* src_ctx = @@ -1017,7 +949,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( * @param buffer The CANN buffer to be cleared. * @param value The value to which each byte in the buffer will be set. */ -GGML_CALL static void ggml_backend_cann_buffer_clear( +static void ggml_backend_cann_buffer_clear( ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -1032,7 +964,7 @@ GGML_CALL static void ggml_backend_cann_buffer_clear( * This structure defines function pointers to operations that can be performed * on a CANN buffer within the backend. */ -static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { +static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { /* .get_name = */ ggml_backend_cann_buffer_get_name, /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, /* .get_base = */ ggml_backend_cann_buffer_get_base, @@ -1065,7 +997,7 @@ struct ggml_backend_cann_buffer_type_context { * @param buft Pointer to the buffer type context. * @return Const pointer to the C-style string containing the name. 
*/ -GGML_CALL static const char* ggml_backend_cann_buffer_type_name( +static const char* ggml_backend_cann_buffer_type_name( ggml_backend_buffer_type_t buft) { return "CANN"; @@ -1082,7 +1014,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name( * @param size Size in bytes of the buffer to allocate. * @return Pointer to the allocated buffer, or nullptr if allocation fails. */ -GGML_CALL static ggml_backend_buffer_t +static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cann_buffer_type_context* buft_ctx = @@ -1095,7 +1027,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, void* dev_ptr; aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_ERROR( + GGML_LOG_ERROR( "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg()); @@ -1121,7 +1053,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, * @return The alignment requirement in bytes (fixed at 128 bytes for CANN * buffers). */ -GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( +static size_t ggml_backend_cann_buffer_type_get_alignment( ggml_backend_buffer_type_t buft) { return 128; @@ -1142,7 +1074,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( * @return The total allocation size in bytes required for the tensor in the * CANN buffer. */ -GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( +static size_t ggml_backend_cann_buffer_type_get_alloc_size( ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -1168,19 +1100,25 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( GGML_UNUSED(buft); } +static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + /** * @brief Interface for managing CANN buffer types in the GGML backend. * * Provides function pointers for allocating, querying properties, and managing * memory for CANN buffer types in the GGML backend. */ -static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { +static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { /* .get_name = */ ggml_backend_cann_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size, - /* .is_host = */ NULL, + /* .is_host = */ ggml_backend_cann_buffer_type_is_host, }; /** @@ -1193,7 +1131,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. 
*/ -GGML_CALL ggml_backend_buffer_type_t +ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) { static std::mutex mutex; std::lock_guard lock(mutex); @@ -1211,6 +1149,7 @@ ggml_backend_cann_buffer_type(int32_t device) { for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1231,7 +1170,7 @@ ggml_backend_cann_buffer_type(int32_t device) { * @param buft Pointer to the host buffer type context. * @return Const pointer to the C-style string containing the name. */ -GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return "CANN_Host"; GGML_UNUSED(buft); @@ -1246,7 +1185,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe * @param buft Pointer to the host buffer context. * @return Const pointer to the C-style string containing the name. */ -GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { return "CANN_Host"; GGML_UNUSED(buffer); @@ -1260,7 +1199,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu * * @param buffer The CANN host buffer to free. */ -GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { +static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { ACL_CHECK(aclrtFreeHost(buffer->context)); } @@ -1280,7 +1219,7 @@ static void * ggml_cann_host_malloc(size_t size) { aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, aclGetRecentErrMsg()); return nullptr; } @@ -1294,7 +1233,7 @@ static void * ggml_cann_host_malloc(size_t size) { * @param size Size in bytes of the host buffer to allocate. * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. */ -GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * hostPtr = ggml_cann_host_malloc(size); if (hostPtr == nullptr) { @@ -1316,7 +1255,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_ * Provides function pointers for allocating, querying properties, and managing * memory for CANN buffer types in the GGML backend. 
*/ -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cann_host_buffer_type_name, @@ -1326,6 +1265,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0), /* .context = */ nullptr, }; @@ -1495,7 +1435,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, * @param backend Pointer to the CANN backend structure. * @return A pointer to a constant string representing the backend name. */ -GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { +static const char* ggml_backend_cann_name(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1510,7 +1450,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { * * @param backend Pointer to the CANN backend structure to be freed. */ -GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { +static void ggml_backend_cann_free(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtSynchronizeDevice()); @@ -1535,7 +1475,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { * @param backend Pointer to the CANN backend structure. * @return Pointer to the buffer type structure for the CANN backend. */ -GGML_CALL static ggml_backend_buffer_type_t +static ggml_backend_buffer_type_t ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1556,11 +1496,11 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { * @param offset Offset in bytes within the host data. * @param size Size of the data to copy in bytes. 
*/ -GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, - ggml_tensor *tensor, - const void *data, - size_t offset, - size_t size) { +static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor *tensor, + const void *data, + size_t offset, + size_t size) { ggml_backend_cann_context *cann_ctx = (ggml_backend_cann_context *)backend->context; @@ -1572,13 +1512,6 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, void *transform_buffer = malloc(size); ggml_backend_cann_transform(tensor, data, transform_buffer); -#ifndef NDEBUG - void *check_buffer = malloc(size); - ggml_backend_cann_transform_back(tensor, transform_buffer, - check_buffer); - GGML_ASSERT(memcmp(data, check_buffer, size)); - free(check_buffer); -#endif ACL_CHECK(aclrtMemcpyAsync( (char *)tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); @@ -1587,7 +1520,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, } } -GGML_CALL static void ggml_backend_cann_get_tensor_async( +static void ggml_backend_cann_get_tensor_async( ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { ggml_backend_cann_context *cann_ctx = @@ -1626,7 +1559,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async( * @param dst Pointer to the destination tensor to copy data to. * @return true if the copy operation succeeds, false otherwise. */ -GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( +static bool ggml_backend_cann_cpy_tensor_async( ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor* src, ggml_tensor* dst) { GGML_ASSERT(ggml_backend_is_cann(backend_src) || @@ -1694,7 +1627,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( * * @param backend Pointer to the CANN backend structure to synchronize. */ -GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { +static void ggml_backend_cann_synchronize(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1715,7 +1648,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation * completes successfully, otherwise an appropriate error status. */ -GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( +static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_t backend, ggml_cgraph* cgraph) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1732,7 +1665,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( bool ok = ggml_cann_compute_forward(*cann_ctx, node); if (!ok) { - GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, + GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); @@ -1753,7 +1686,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( * @return bool Returns true if the operation is supported by the backend, * otherwise false. 
*/ -GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, +static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { switch (op->op) { case GGML_OP_UNARY: @@ -1844,7 +1777,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, return false; } - GGML_UNUSED(backend); + GGML_UNUSED(dev); } /** @@ -1862,31 +1795,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_cann_buffer_type_name; } -/** - * @brief Checks if the CANN backend supports a specific backend buffer type. - * - * This function determines whether the CANN backend supports the given backend - * buffer type by comparing the device context of the backend and buffer type. - * It returns true if the devices are same between the backend context and - * buffer type context. - * - * @param backend Pointer to the CANN backend. - * @param buft Pointer to the backend buffer type to check. - * @return bool Returns true if the CANN backend supports the buffer type, - * otherwise false. - */ -GGML_CALL static bool ggml_backend_cann_supports_buft( - ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_context * cann_ctx = - (ggml_backend_cann_context *)backend->context; - ggml_backend_cann_buffer_type_context * buft_ctx = - (ggml_backend_cann_buffer_type_context *)buft->context; - return buft_ctx->device == cann_ctx->device; - } - return false; -} - /** * @brief Determines if a tensor operation should be offloaded to the CANN * backend. @@ -1901,54 +1809,14 @@ GGML_CALL static bool ggml_backend_cann_supports_buft( * @return bool Returns true if the operation should be offloaded, otherwise * false. */ -GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend, +static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor* op) { const int min_batch_size = 32; - GGML_UNUSED(backend); + GGML_UNUSED(dev); return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; } -/** - * @brief Creates a new event for the CANN backend. - * - * This function initializes a new event for the CANN backend by setting the - * device and creating an ACL runtime event. The created event is then wrapped - * in a ggml_backend_event structure and returned. - * - * @param backend Pointer to the CANN backend. - * @return ggml_backend_event_t Returns a pointer to the new event structure. - */ -static ggml_backend_event_t ggml_backend_cann_event_new( - ggml_backend_t backend) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; - - ggml_cann_set_device(cann_ctx->device); - - aclrtEvent event; - ACL_CHECK(aclrtCreateEvent(&event)); - - return new ggml_backend_event{ - /* .backend = */ backend, - /* .context = */ event, - }; -} - -/** - * @brief Frees a CANN backend event. - * - * This function destroys the ACL runtime event associated with the given CANN - * backend event and then deletes the event structure itself. - * - * @param event Pointer to the event structure to be freed. - */ -static void ggml_backend_cann_event_free(ggml_backend_event_t event) { - ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); - - delete event; -} - /** * @brief Records an event on the CANN backend stream. * @@ -1957,10 +1825,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) { * * @param event Pointer to the event structure to be recorded. 
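With supports_op and offload_op moved to the device, the scheduler-side check now goes through the device handle; a sketch under the assumption that ggml_backend_dev_supports_op and ggml_backend_dev_offload_op are the corresponding wrappers in ggml-backend.h. Given min_batch_size = 32 above, a MUL_MAT node with ne[1] == 1 (single-token decode) is kept where its weights already live, while a 512-token prompt batch qualifies for offloading.

    // illustrative only
    static bool example_should_offload(ggml_backend_dev_t dev, const struct ggml_tensor * node) {
        return ggml_backend_dev_supports_op(dev, node) &&  // device implements the op
               ggml_backend_dev_offload_op(dev, node);     // and the batch is large enough to be worth moving
    }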
*/ -static void ggml_backend_cann_event_record(ggml_backend_event_t event) { +static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)event->backend->context; - + (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); } @@ -1978,8 +1845,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; - - if (ggml_backend_is_cann(event->backend)) { + if (ggml_backend_is_cann(backend)) { ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { @@ -1987,17 +1853,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, } } -/** - * @brief Synchronizes the given event on the CANN backend. - * - * This function waits for the specified event to complete on the ACL runtime. - * - * @param event Pointer to the event structure to be synchronized. - */ -static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) { - ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); -} - /** * @brief Structure defining the interface for the CANN backend. * @@ -2005,7 +1860,7 @@ static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) { * supported by the CANN backend, including name retrieval, memory * management, tensor operations, synchronization, and event handling. */ -static ggml_backend_i ggml_backend_cann_interface = { +static const ggml_backend_i ggml_backend_cann_interface = { /* .get_name = */ ggml_backend_cann_name, /* .free = */ ggml_backend_cann_free, /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type, @@ -2018,14 +1873,11 @@ static ggml_backend_i ggml_backend_cann_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cann_graph_compute, - /* .supports_op = */ ggml_backend_cann_supports_op, - /* .supports_buft = */ ggml_backend_cann_supports_buft, - /* .offload_op = */ ggml_backend_cann_offload_op, - /* .event_new = */ ggml_backend_cann_event_new, - /* .event_free = */ ggml_backend_cann_event_free, + /* .supports_op = */ NULL, // moved to device + /* .supports_buft = */ NULL, // moved to device + /* .offload_op = */ NULL, // moved to device /* .event_record = */ ggml_backend_cann_event_record, /* .event_wait = */ ggml_backend_cann_event_wait, - /* .event_synchronize = */ ggml_backend_cann_event_synchronize, }; /** @@ -2042,91 +1894,274 @@ static ggml_guid_t ggml_backend_cann_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { +// backend device +struct ggml_backend_cann_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + 
ggml_backend_cann_get_device_memory(ctx->device, free, total); +} + +static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cann_device_get_name(dev); + props->description = ggml_backend_cann_device_get_description(dev); + props->type = ggml_backend_cann_device_get_type(dev); + ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr; + + props->caps = { + /* .async = */ false, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ true, + }; +} + +static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ggml_backend_cann_init(ctx->device); +} + +/** + * @brief Checks if the CANN backend supports a specific backend buffer type. + * + * This function determines whether the CANN backend supports the given backend + * buffer type by comparing the device context of the backend and buffer type. + * It returns true if the devices are same between the backend context and + * buffer type context. + * + * @param backend Pointer to the CANN backend. + * @param buft Pointer to the backend buffer type to check. + * @return bool Returns true if the CANN backend supports the buffer type, + * otherwise false. + */ +static bool ggml_backend_cann_supports_buft( + ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_buffer_type_context * buft_ctx = + (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == dev_ctx->device; + } + return false; +} + +static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ggml_backend_cann_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_cann_host_buffer_type(); +} + +/** + * @brief Creates a new event for the CANN backend device. + * + * This function initializes a new event for the CANN backend by setting the + * device and creating an ACL runtime event. The created event is then wrapped + * in a ggml_backend_event structure and returned. + * + * @param backend Pointer to the CANN backend. + * @return ggml_backend_event_t Returns a pointer to the new event structure. + */ +static ggml_backend_event_t ggml_backend_cann_device_event_new( + ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + + ggml_cann_set_device(dev_ctx->device); + + aclrtEvent event; + ACL_CHECK(aclrtCreateEvent(&event)); + + return new ggml_backend_event{ + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device), + /* .context = */ event, + }; +} + +/** + * @brief Frees a CANN backend event. 
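A sketch of how the per-device queries above are expected to be exercised once the registry defined further below is in place; ggml_backend_reg_dev_count, ggml_backend_dev_name and ggml_backend_dev_memory are assumed to be the generic wrappers from ggml-backend.h.

    // illustrative only
    static void example_print_cann_memory(void) {
        ggml_backend_reg_t reg = ggml_backend_cann_reg();
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
            size_t free = 0, total = 0;
            ggml_backend_dev_memory(dev, &free, &total);  // ends up in aclrtGetMemInfo(ACL_HBM_MEM, ...)
            GGML_LOG_INFO("%s: %zu MiB free / %zu MiB total\n",
                          ggml_backend_dev_name(dev), free / 1024 / 1024, total / 1024 / 1024);
        }
    }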
+ * + * This function destroys the ACL runtime event associated with the given CANN + * backend event and then deletes the event structure itself. + * + * @param event Pointer to the event structure to be freed. + */ +static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + + delete event; + GGML_UNUSED(dev); +} + +/** + * @brief Synchronizes the given event on the CANN backend. + * + * This function waits for the specified event to complete on the ACL runtime. + * + * @param event Pointer to the event structure to be synchronized. + */ +static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); + + GGML_UNUSED(dev); +} + +static const ggml_backend_device_i ggml_backend_cann_device_interface = { + /* .get_name = */ ggml_backend_cann_device_get_name, + /* .get_description = */ ggml_backend_cann_device_get_description, + /* .get_memory = */ ggml_backend_cann_device_get_memory, + /* .get_type = */ ggml_backend_cann_device_get_type, + /* .get_props = */ ggml_backend_cann_device_get_props, + /* .init_backend = */ ggml_backend_cann_device_init, // called for every card + /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, // not supported for CANN + /* .supports_op = */ ggml_backend_cann_supports_op, + /* .supports_buft = */ ggml_backend_cann_supports_buft, + /* .offload_op = */ ggml_backend_cann_offload_op, + /* .event_new = */ ggml_backend_cann_device_event_new, + /* .event_free = */ ggml_backend_cann_device_event_free, + /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize, +}; + + +// backend reg +struct ggml_backend_cann_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_CANN_NAME; +} + +static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) { + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + GGML_UNUSED(name); + // reserved for future use + return nullptr; +} + +static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { + /* .get_name = */ ggml_backend_cann_reg_get_name, + /* .get_device_count = */ ggml_backend_cann_reg_get_device_count, + /* .get_device_get = */ ggml_backend_cann_reg_get_device, + /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address, +}; + +// backend registry, called only once for cann backend +ggml_backend_reg_t ggml_backend_cann_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + aclInit(nullptr); + ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; + + for (int i = 0; i < ggml_cann_info().device_count; i++) { + ggml_backend_cann_device_context* dev_ctx = new 
ggml_backend_cann_device_context(); + dev_ctx->description = aclrtGetSocName(); + dev_ctx->device = i; + dev_ctx->name = GGML_CANN_NAME + std::to_string(i); + ggml_cann_set_device(i); + ggml_backend_dev_t dev = new ggml_backend_device { + /* .interface = */ ggml_backend_cann_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .interface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + + return &reg; +} + +ggml_backend_t ggml_backend_cann_init(int32_t device) { aclInit(nullptr); if (device < 0 || device >= ggml_backend_cann_get_device_count()) { - GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); + GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); return nullptr; } ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); if (ctx == nullptr) { - GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), /* .context = */ ctx}; return cann_backend; } -GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) { +bool ggml_backend_is_cann(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); } -GGML_CALL int32_t ggml_backend_cann_get_device_count() { +int32_t ggml_backend_cann_get_device_count() { return ggml_cann_info().device_count; } -GGML_CALL void ggml_backend_cann_get_device_description( +void ggml_backend_cann_get_device_description( int32_t device, char* description, size_t description_size) { ggml_cann_set_device(device); const char* soc_name = aclrtGetSocName(); snprintf(description, description_size, "%s", soc_name); } -GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, - size_t* total) { +void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } - -// backend registry -/** - * @brief Initializes a CANN backend based on the provided parameters. - * - * This function initializes a CANN backend using the device index and then - * initializes the backend using `ggml_backend_cann_init`. - * - * @param params Parameters for initialization (unused in this implementation). - * @param user_data User data containing the device index to initialize the - * backend. - * @return ggml_backend_t The initialized CANN backend. - */ -GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, - void* user_data) { - ggml_backend_t cann_backend = - ggml_backend_cann_init((int)(intptr_t)user_data); - return cann_backend; - - GGML_UNUSED(params); -} - -extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); - -/** - * @brief Registers CANN (Ascend) devices as backend options. - * - * This function initializes ACL, retrieves the number of available CANN - * devices, and registers each device as a backend option using - * `ggml_backend_register`. Each device is given a unique name based on - * `GGML_CANN_NAME` followed by its index. - * - * @return int The number of CANN devices registered.
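The helper being removed here used to register every card individually via ggml_backend_register(); with the registry above, selecting a card reduces to picking a device out of the reg, e.g. as below (ggml_backend_dev_init and ggml_backend_free assumed from ggml-backend.h, device index 0 chosen for illustration).

    // illustrative only: equivalent to the old "CANN0" registration plus ggml_backend_cann_init(0)
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0);
    ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ nullptr);
    // ... build and compute graphs with `backend` ...
    ggml_backend_free(backend);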
- */ -GGML_CALL int ggml_backend_cann_reg_devices() { - uint32_t device_count = ggml_backend_cann_get_device_count(); - // initialization - for (uint32_t i = 0; i < device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "CANN%d", i); - ggml_backend_register(name, ggml_backend_reg_cann_init, - ggml_backend_cann_buffer_type(i), - (void*)(intptr_t)i); - } - return device_count; -} diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 0bb7f2d99..1338bd458 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -5,12 +5,14 @@ #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" #include "ggml-cuda/arange.cuh" +#include "ggml-cuda/argmax.cuh" #include "ggml-cuda/argsort.cuh" #include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/clamp.cuh" #include "ggml-cuda/concat.cuh" #include "ggml-cuda/conv-transpose-1d.cuh" #include "ggml-cuda/convert.cuh" +#include "ggml-cuda/count-equal.cuh" #include "ggml-cuda/cpy.cuh" #include "ggml-cuda/cross-entropy-loss.cuh" #include "ggml-cuda/diagmask.cuh" @@ -56,54 +58,16 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); -static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - GGML_UNUSED(level); - GGML_UNUSED(user_data); - fprintf(stderr, "%s", msg); -} - -ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback; -void * ggml_cuda_log_user_data = NULL; - -GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_cuda_log_callback = log_callback; - ggml_cuda_log_user_data = user_data; -} - -#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - -GGML_ATTRIBUTE_FORMAT(2, 3) -static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) 
{ - if (ggml_cuda_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data); - } else { - std::vector buffer2(len + 1); // vsnprintf adds a null terminator - va_end(args); - va_start(args, format); - vsnprintf(&buffer2[0], buffer2.size(), format, args); - ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data); - } - va_end(args); - } -} - [[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails cudaGetDevice(&id); - GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg); - GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); - GGML_CUDA_LOG_ERROR(" %s\n", stmt); - // abort with GGML_ASSERT to get a stack trace - GGML_ABORT("CUDA error"); + GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); + GGML_LOG_ERROR(" %s\n", stmt); + // abort with GGML_ABORT to get a stack trace + GGML_ABORT(GGML_CUDA_NAME " error"); } // this is faster on Windows @@ -166,7 +130,7 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaError_t err = cudaGetDeviceCount(&info.device_count); if (err != cudaSuccess) { - GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); + GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); return info; } @@ -174,20 +138,20 @@ static ggml_cuda_device_info ggml_cuda_init() { int64_t total_vram = 0; #ifdef GGML_CUDA_FORCE_MMQ - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); #else - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); #endif // GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_CUBLAS - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__); #else - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); #endif // GGML_CUDA_FORCE_CUBLAS - GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); + GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -199,12 +163,12 @@ static ggml_cuda_device_info ggml_cuda_init() { alloc_prop.location.id = id; CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); } -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) info.devices[id].vmm = !!device_vmm; cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, 
prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -312,7 +276,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CUDA_MALLOC - GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, + GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024)); #endif return ptr; @@ -327,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { return; } } - GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); ggml_cuda_set_device(device); CUDA_CHECK(cudaFree(ptr)); pool_size -= size; @@ -335,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -429,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); } }; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr(new ggml_cuda_pool_vmm(device)); } -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) return std::unique_ptr(new ggml_cuda_pool_leg(device)); } @@ -457,26 +421,26 @@ struct ggml_backend_cuda_buffer_context { } }; -GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; } -GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } -GGML_CALL static void 
ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL) { @@ -496,7 +460,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } } -GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -504,7 +468,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -512,7 +476,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -520,7 +484,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; @@ -541,7 +505,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -550,7 +514,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe CUDA_CHECK(cudaDeviceSynchronize()); } -static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { +static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ 
ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, @@ -569,17 +533,17 @@ struct ggml_backend_cuda_buffer_type_context { std::string name; }; -GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) { ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; return ctx->name.c_str(); } static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; + return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_cuda_set_device(buft_ctx->device); @@ -591,7 +555,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); + GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); return nullptr; } @@ -600,13 +564,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -621,8 +585,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_buffer_type_name, +static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX @@ -630,7 +594,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { static std::mutex mutex; std::lock_guard lock(mutex); @@ -643,9 +607,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { static bool ggml_backend_cuda_buffer_type_initialized = false; if (!ggml_backend_cuda_buffer_type_initialized) { - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; 
i++) { + for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i), /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } @@ -715,7 +680,7 @@ struct ggml_backend_cuda_split_buffer_context { std::vector tensor_extras; }; -GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Split"; GGML_UNUSED(buffer); @@ -726,19 +691,19 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds } -GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced return (void *)0x1000; GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -786,7 +751,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu tensor->extra = extra; } -GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -824,7 +789,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf } } -GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -862,12 +827,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf } } -GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { GGML_UNUSED(buffer); GGML_UNUSED(value); } -static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface 
= { +static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, @@ -882,17 +847,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { // cuda split buffer type -GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Split"; GGML_UNUSED(buft); } static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, @@ -902,13 +867,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -935,14 +900,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; GGML_UNUSED(buft); } -static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, +static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX @@ -950,7 +915,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { static std::mutex mutex; 
std::lock_guard lock(mutex); @@ -979,6 +944,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, }; @@ -988,19 +954,19 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f // host buffer type -GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Host"; GGML_UNUSED(buft); } -GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Host"; GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -1014,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return nullptr; } @@ -1022,7 +988,7 @@ static void * ggml_cuda_host_malloc(size_t size) { return ptr; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * ptr = ggml_cuda_host_malloc(size); if (ptr == nullptr) { @@ -1038,7 +1004,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_ return buffer; } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, @@ -1048,6 +1014,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .context = */ nullptr, }; @@ -2178,6 +2145,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg } switch (dst->op) { + case GGML_OP_ARGMAX: + ggml_cuda_argmax(ctx, dst); + break; + case GGML_OP_COUNT_EQUAL: + ggml_cuda_count_equal(ctx, dst); + break; case GGML_OP_REPEAT: ggml_cuda_op_repeat(ctx, dst); break; @@ -2280,7 +2253,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { - GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); + GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" 
PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); return false; } else { ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); @@ -2364,7 +2337,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst)); + GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst)); CUDA_CHECK(err); } @@ -2375,26 +2348,26 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg // backend -GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { +static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return cuda_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) { +static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } -GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -2403,7 +2376,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); } -GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -2412,7 +2385,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); } -GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? 
dst->view_src->buffer : dst->buffer; @@ -2433,7 +2406,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); + GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__); #endif return false; } @@ -2467,7 +2440,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ return true; } -GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); @@ -2475,6 +2448,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { GGML_UNUSED(backend); } +#ifdef USE_CUDA_GRAPH static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { graph_node_properties->node_address = node->data; graph_node_properties->node_op = node->op; @@ -2525,8 +2499,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } +#endif -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); @@ -2549,7 +2524,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } } @@ -2600,14 +2575,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); #endif } if (node->op == GGML_OP_MUL_MAT_ID) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); #endif } @@ -2616,7 +2591,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t // Changes in batch size or context size can cause changes to the grid size of some kernels. 
use_cuda_graph = false; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } @@ -2628,7 +2603,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (!ptr) { use_cuda_graph = false; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); #endif } else { if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { @@ -2652,7 +2627,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); #endif } } @@ -2691,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { - GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); } @@ -2710,7 +2685,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t use_cuda_graph = false; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__); #endif } else { graph_evaluated_or_captured = true; // CUDA graph has been captured @@ -2777,7 +2752,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); if (stat == cudaErrorGraphExecUpdateFailure) { #ifndef NDEBUG - GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__); + GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__); #endif // The pre-existing graph exec cannot be updated due to violated constraints // so instead clear error and re-instantiate @@ -2798,8 +2773,188 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t return GGML_STATUS_SUCCESS; } -GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; +static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream())); +} + +static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + if (ggml_backend_is_cuda(backend)) { + 
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0)); + } else { +#if 0 + // untested + auto wait_fn = [](void * user_data) { + ggml_backend_event_t event = (ggml_backend_event_t)user_data; + ggml_backend_event_synchronize(event); + }; + + CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); +#endif + GGML_ABORT("fatal error"); + } +} + +static const ggml_backend_i ggml_backend_cuda_interface = { + /* .get_name = */ ggml_backend_cuda_get_name, + /* .free = */ ggml_backend_cuda_free, + /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ NULL, // moved to device + /* .supports_buft = */ NULL, // moved to device + /* .offload_op = */ NULL, // moved to device + /* .event_record = */ ggml_backend_cuda_event_record, + /* .event_wait = */ ggml_backend_cuda_event_wait, +}; + +static ggml_guid_t ggml_backend_cuda_guid() { + static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; + return &guid; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid()); +} + +int ggml_backend_cuda_get_device_count() { + return ggml_cuda_info().device_count; +} + +void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { + if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { + return false; + } + +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) + cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + + GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, cudaGetErrorString(err)); + return false; + } + return true; +#else + return false; +#endif +} + +void ggml_backend_cuda_unregister_host_buffer(void * buffer) { + if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { + return; + } + + cudaError_t err = cudaHostUnregister(buffer); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + } +} + + +// backend device + +struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return 
ctx->description.c_str(); +} + +static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cuda_device_get_name(dev); + props->description = ggml_backend_cuda_device_get_description(dev); + props->type = ggml_backend_cuda_device_get_type(dev); + ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; +#ifdef GGML_CUDA_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* .async = */ true, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ events, + }; +} + +static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ggml_backend_cuda_init(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ggml_backend_cuda_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_cuda_host_buffer_type(); +} + +static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +// TODO: move these functions here +static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -2929,6 +3084,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } break; case GGML_OP_DUP: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; + case GGML_OP_ARGMAX: + case GGML_OP_COUNT_EQUAL: + { + return true; + } break; case GGML_OP_REPEAT: { ggml_type src0_type = op->src[0]->type; @@ -3004,7 +3168,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) { return true; } - const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; } case GGML_OP_CROSS_ENTROPY_LOSS: @@ -3014,205 +3178,181 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons default: return false; } - - GGML_UNUSED(backend); } -GGML_CALL static bool 
ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cuda_split(buft)) { return true; } if (ggml_backend_buft_is_cuda(buft)) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - return buft_ctx->device == cuda_ctx->device; + return buft_ctx->device == dev_ctx->device; } return false; } -GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - GGML_UNUSED(backend); + GGML_UNUSED(dev); } -static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { +static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) { #ifdef GGML_CUDA_NO_PEER_COPY return nullptr; #else - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; - ggml_cuda_set_device(cuda_ctx->device); + ggml_cuda_set_device(dev_ctx->device); cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); return new ggml_backend_event { - /* .backend = */ backend, + /* .device = */ dev, /* .context = */ event, }; #endif } -static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { - CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); +static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); + CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); delete event; } -static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context; - - CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream())); -} - -static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - if (ggml_backend_is_cuda(event->backend)) { - CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0)); - } else { -#if 0 - // untested - auto wait_fn = [](void * user_data) { - ggml_backend_event_t event = (ggml_backend_event_t)user_data; - ggml_backend_event_synchronize(event); - }; - - CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); -#endif - GGML_ABORT("fatal error"); - } -} - -static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) { +static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); } -static ggml_backend_i ggml_backend_cuda_interface = { - /* .get_name = */ ggml_backend_cuda_name, - /* .free = */ ggml_backend_cuda_free, - /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, - /* .set_tensor_async = */ 
ggml_backend_cuda_set_tensor_async, - /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, - /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, - /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_cuda_graph_compute, - /* .supports_op = */ ggml_backend_cuda_supports_op, - /* .supports_buft = */ ggml_backend_cuda_supports_buft, - /* .offload_op = */ ggml_backend_cuda_offload_op, - /* .event_new = */ ggml_backend_cuda_event_new, - /* .event_free = */ ggml_backend_cuda_event_free, - /* .event_record = */ ggml_backend_cuda_event_record, - /* .event_wait = */ ggml_backend_cuda_event_wait, - /* .event_synchronize = */ ggml_backend_cuda_event_synchronize, +static const ggml_backend_device_i ggml_backend_cuda_device_interface = { + /* .get_name = */ ggml_backend_cuda_device_get_name, + /* .get_description = */ ggml_backend_cuda_device_get_description, + /* .get_memory = */ ggml_backend_cuda_device_get_memory, + /* .get_type = */ ggml_backend_cuda_device_get_type, + /* .get_props = */ ggml_backend_cuda_device_get_props, + /* .init_backend = */ ggml_backend_cuda_device_init, + /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_cuda_device_supports_op, + /* .supports_buft = */ ggml_backend_cuda_device_supports_buft, + /* .offload_op = */ ggml_backend_cuda_device_offload_op, + /* .event_new = */ ggml_backend_cuda_device_event_new, + /* .event_free = */ ggml_backend_cuda_device_event_free, + /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, }; -static ggml_guid_t ggml_backend_cuda_guid() { - static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; - return &guid; +// backend reg + +struct ggml_backend_cuda_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_CUDA_NAME; } -GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { +static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) { + ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context; + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + return (void *)ggml_backend_cuda_split_buffer_type; + } + if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { + return (void *)ggml_backend_cuda_register_host_buffer; + } + if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { + return (void *)ggml_backend_cuda_unregister_host_buffer; + } + return nullptr; +} + +static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = { + /* .get_name = */ ggml_backend_cuda_reg_get_name, + /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count, + /* .get_device_get = */ 
ggml_backend_cuda_reg_get_device, + /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address, +}; + +// backend registry +ggml_backend_reg_t ggml_backend_cuda_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard<std::mutex> lock(mutex); + if (!initialized) { + ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; + + for (int i = 0; i < ggml_cuda_info().device_count; i++) { + ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; + dev_ctx->device = i; + dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); + + ggml_cuda_set_device(i); + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); + dev_ctx->description = prop.name; + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .interface = */ ggml_backend_cuda_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .interface = */ ggml_backend_cuda_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + + return &reg; +} + +ggml_backend_t ggml_backend_cuda_init(int device) { if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { - GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device); + GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); return nullptr; } ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); if (ctx == nullptr) { - GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: failed to allocate context\n", __func__); return nullptr; } ggml_backend_t cuda_backend = new ggml_backend { /* .guid = */ ggml_backend_cuda_guid(), /* .interface = */ ggml_backend_cuda_interface, - /* .context = */ ctx + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), + /* .context = */ ctx, }; return cuda_backend; } - -GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid()); -} - -GGML_CALL int ggml_backend_cuda_get_device_count() { - return ggml_cuda_info().device_count; -} - -GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - snprintf(description, description_size, "%s", prop.name); -} - -GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { - ggml_cuda_set_device(device); - - CUDA_CHECK(cudaMemGetInfo(free, total)); -} - -GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { - if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { - return false; - } - -#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) - cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); - if (err != cudaSuccess) { - // clear the error - cudaGetLastError(); - - GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, - size / 1024.0 / 1024.0, cudaGetErrorString(err)); - return false; - } - return true; -#else - return false; -#endif -} - -GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { - if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { - return; - } - - cudaError_t err = cudaHostUnregister(buffer); - if (err != cudaSuccess) { - // clear the error - cudaGetLastError(); - } -} - -// backend registry -GGML_CALL static ggml_backend_t
ggml_backend_reg_cuda_init(const char * params, void * user_data) { - ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); - return cuda_backend; - - GGML_UNUSED(params); -} - -extern "C" GGML_CALL int ggml_backend_cuda_reg_devices(); - -GGML_CALL int ggml_backend_cuda_reg_devices() { - int device_count = ggml_backend_cuda_get_device_count(); - //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization - for (int i = 0; i < device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i); - ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i); - } - return device_count; -} diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu new file mode 100644 index 000000000..aab04eca7 --- /dev/null +++ b/ggml/src/ggml-cuda/argmax.cu @@ -0,0 +1,79 @@ +#include "common.cuh" +#include "argmax.cuh" +#include "sum.cuh" + +#include + +static __global__ void argmax_f32( + const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { + + int argmax_thread = 0; + const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; + +#pragma unroll + for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { + const int64_t row = row0 + row1; + + if (row >= nrows) { + break; + } + + float maxval = -FLT_MAX; + int argmax = -1; + + for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { + const float val = x[row*ncols + col]; + const int bigger = val > maxval; + const int not_bigger = bigger ^ 0x00000001; + + maxval = maxval*not_bigger + val*bigger; + argmax = argmax*not_bigger + col*bigger; + } + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); + const int bigger = val > maxval; + const int not_bigger = bigger ^ 0x00000001; + + maxval = maxval*not_bigger + val*bigger; + argmax = argmax*not_bigger + col*bigger; + } + + const int store = row1 == threadIdx.x; + argmax_thread += store*argmax; + } + + const int row = row0 + threadIdx.x; + + if (row >= nrows) { + return; + } + + dst[row] = argmax_thread; +} + +void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const float * src0_d = (const float *) src0->data; + int32_t * dst_d = (int32_t *) dst->data; + + cudaStream_t stream = ctx.stream(); + + const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; + + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(num_blocks, 1, 1); + + argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows); +} diff --git a/ggml/src/ggml-cuda/argmax.cuh b/ggml/src/ggml-cuda/argmax.cuh new file mode 100644 index 000000000..5b7223adc --- /dev/null +++ b/ggml/src/ggml-cuda/argmax.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 6a4bcdba0..dd203fcde 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -175,6 +175,18 @@ static __device__ void no_device_code( #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ +static
__device__ __forceinline__ int warp_reduce_sum(int x) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE + return __reduce_add_sync(0xffffffff, x); +#else +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +} + static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu new file mode 100644 index 000000000..ffb053b10 --- /dev/null +++ b/ggml/src/ggml-cuda/count-equal.cu @@ -0,0 +1,64 @@ +#include "common.cuh" +#include "count-equal.cuh" + +#include + +template <typename T> +static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) { + const int64_t i0 = (int64_t) blockIdx.x*dk; + const int64_t i1 = min(i0 + dk, k); + + int nequal = 0; + + for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) { + const T xi = x[i]; + const T yi = y[i]; + nequal += xi == yi; + } + + nequal = warp_reduce_sum(nequal); + + if (threadIdx.x != 0) { + return; + } + + atomicAdd((int *) dst, nequal); +} + +void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == src1->type); + GGML_ASSERT( dst->type == GGML_TYPE_I64); + + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + int64_t * dst_d = (int64_t *) dst->data; + + cudaStream_t stream = ctx.stream(); + const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int"); + const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE); + + CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream)); + + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1); + + switch (src0->type) { + case GGML_TYPE_I32: { + const int * src0_d = (const int *) src0->data; + const int * src1_d = (const int *) src1->data; + count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne); + } break; + default: + GGML_ASSERT(false); + break; + } +} diff --git a/ggml/src/ggml-cuda/count-equal.cuh b/ggml/src/ggml-cuda/count-equal.cuh new file mode 100644 index 000000000..8467da79e --- /dev/null +++ b/ggml/src/ggml-cuda/count-equal.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 + +void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 96a5adef5..00e21b5d7 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const half * x = (const half *) vx; - + // load 2 halfs into register in a single instruction + const half2 x_reg = *((half2 *) &(x[ib + iqs])); // automatic half -> float type cast if dfloat == float
- v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; + v.x = __low2float(x_reg); + v.y = __high2float(x_reg); } static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { @@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 #ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += __hmul2(v, y_reg); + } + else { + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); + } #else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += v.x * y_reg.x; + tmp += v.y * y_reg.y; + } + else { + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + } #endif // GGML_CUDA_F16 } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 342f2eb66..5af02c7ec 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -259,7 +259,7 @@ static __global__ void flash_attn_tile_ext_f16( } half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); - kqsum_j = warp_reduce_sum(kqsum_j); + kqsum_j = warp_reduce_sum((float)kqsum_j); #pragma unroll for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 448a9a905..2ed6509ac 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -196,7 +196,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j = 0; j < ncols; ++j) { half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum((float)sum); if (use_logit_softcap) { sum = logit_softcap*tanhf(sum); @@ -265,7 +265,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); + kqsum[j] = warp_reduce_sum((float)kqsum[j]); if (threadIdx.x == 0) { kqsum_shared[j][threadIdx.y] = kqsum[j]; } @@ -280,7 +280,7 @@ static __global__ void flash_attn_vec_ext_f16( } kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); + kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]); half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); if (parallel_blocks == 1) { diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 3d0d8d4e6..16463ab0f 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -69,7 +69,6 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 833984190..65c4f8119 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -19,6 +19,9 @@ extern "C" { #define MIN(a, b) 
((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// required for mmap as gguf only guarantees 32-byte alignment +#define TENSOR_ALIGNMENT 32 + // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop @@ -33,6 +36,21 @@ extern "C" { #endif #endif +// +// logging +// + +GGML_ATTRIBUTE_FORMAT(2, 3) +void ggml_log_internal (enum ggml_log_level level, const char * format, ...); +void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data); + +#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) +#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) +#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) + // bitset typedef uint32_t ggml_bitset_t; @@ -181,6 +199,11 @@ struct ggml_cgraph { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); +// Memory allocation + +void * ggml_aligned_malloc(size_t size); +void ggml_aligned_free(void * ptr, size_t size); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 9cbc57a64..2c926aaee 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1921,6 +1921,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) { for (const auto & dev : devices) { vec.push_back({ /* .iface = */ ggml_backend_kompute_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc) }); } @@ -1989,11 +1990,8 @@ static struct ggml_backend_i kompute_backend_i = { /* .supports_op = */ ggml_backend_kompute_supports_op, /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_kompute_guid() { @@ -2008,6 +2006,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) { ggml_backend_t kompute_backend = new ggml_backend { /* .guid = */ ggml_backend_kompute_guid(), /* .interface = */ kompute_backend_i, + /* .device = */ nullptr, /* .context = */ s_kompute_context, }; @@ -2017,23 +2016,3 @@ ggml_backend_t ggml_backend_kompute_init(int device) { bool ggml_backend_is_kompute(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); } - -static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) { - GGML_UNUSED(params); - return ggml_backend_kompute_init(intptr_t(user_data)); -} - -extern "C" int ggml_backend_kompute_reg_devices(); - -int ggml_backend_kompute_reg_devices() { - auto devices = ggml_vk_available_devices_internal(0); - for (const auto & device : devices) { - ggml_backend_register( - ggml_kompute_format_name(device.index).c_str(), - ggml_backend_reg_kompute_init, - ggml_backend_kompute_buffer_type(device.index), - reinterpret_cast(intptr_t(device.index)) - ); - } - return devices.size(); -} diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index ef3b7f0e8..172a0f925 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ 
-12,21 +12,77 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#ifdef GGML_METAL_NDEBUG -#define GGML_METAL_LOG(...) -#define GGML_METAL_LOG_INFO(...) -#define GGML_METAL_LOG_WARN(...) -#define GGML_METAL_LOG_ERROR(...) -#else -#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) -#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) -#endif +// max memory buffers that can be mapped to the device +#define GGML_METAL_MAX_BUFFERS 64 + +// max number of MTLCommandBuffer used to submit a graph for processing +#define GGML_METAL_MAX_COMMAND_BUFFERS 8 #define UNUSED(x) (void)(x) +// globals + +// overload of MTLGPUFamilyMetal3 (not available in some environments) +static const NSInteger MTLGPUFamilyMetal3_GGML = 5001; + +// initialized in ggml_backend_metal_reg +static struct ggml_backend_reg g_ggml_backend_metal_reg; +static struct ggml_backend_device g_ggml_backend_metal_device; + +// information about a Metal device +// note: assumes single GPU device - the default one +// TODO: support multiple GPU devices +static struct ggml_backend_metal_device_context { + id mtl_device; + int mtl_device_ref_count; + + bool support_simdgroup_reduction; + bool support_simdgroup_mm; + + char name[128]; +} g_ggml_ctx_dev_main = { + /*.mtl_device =*/ nil, + /*.mtl_device_ref_count =*/ 0, + /*.support_simdgroup_reduction =*/ false, + /*.support_simdgroup_mm =*/ false, + /*.name =*/ "", +}; + +// acquire +static id ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) { + assert(ctx != NULL); + + if (ctx->mtl_device == nil) { + ctx->mtl_device = MTLCreateSystemDefaultDevice(); + + ctx->support_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->support_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + + ctx->support_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); + } + + ctx->mtl_device_ref_count++; + + return ctx->mtl_device; +} + +// release +static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_context * ctx) { + assert(ctx != NULL); + assert(ctx->mtl_device_ref_count > 0); + + ctx->mtl_device_ref_count--; + + if (ctx->mtl_device_ref_count == 0) { + [ctx->mtl_device release]; + ctx->mtl_device = nil; + } +} + +// kernels + struct ggml_metal_kernel { id pipeline; }; @@ -221,19 +277,31 @@ enum ggml_metal_kernel_type { }; struct ggml_backend_metal_context { - int n_cb; - - id device; id queue; dispatch_queue_t d_queue; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; - bool support_simdgroup_reduction; - bool support_simdgroup_mm; + // capture state + bool capture_next_compute; + bool capture_started; - bool should_capture_next_compute; + id capture_scope; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph * gf; + + // the callback given to the thread pool + void (^encode_async)(size_t ith); + + // n_cb command buffers 
+ 1 used by the main thread + id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -251,51 +319,19 @@ struct ggml_backend_metal_context { @implementation GGMLMetalClass @end -static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); -} - -ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; -void * ggml_metal_log_user_data = NULL; - -GGML_ATTRIBUTE_FORMAT(2, 3) -static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ - if (ggml_metal_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); - } else { - char* buffer2 = malloc(len+1); - va_end(args); - va_start(args, format); - vsnprintf(buffer2, len+1, format, args); - buffer2[len] = 0; - ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); - free(buffer2); - } - va_end(args); - } -} - static void * ggml_metal_host_malloc(size_t n) { void * data = NULL; #if TARGET_OS_OSX kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE); if (err != KERN_SUCCESS) { - GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); + GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); return NULL; } #else const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); + GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } #endif @@ -303,27 +339,26 @@ static void * ggml_metal_host_malloc(size_t n) { return data; } -static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { - GGML_METAL_LOG_INFO("%s: allocating\n", __func__); +static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) { + GGML_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG // Show all the Metal device instances in the system NSArray * devices = MTLCopyAllDevices(); for (id device in devices) { - GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); + GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); } [devices release]; // since it was created by a *Copy* C method #endif - // Pick and show default Metal device - id device = MTLCreateSystemDefaultDevice(); - GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - - // Configure context + // init context struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); - ctx->device = device; - ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); - ctx->queue = [ctx->device newCommandQueue]; + struct ggml_backend_metal_device_context * ctx_dev = dev->context; + + id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); + + ctx->queue = [device newCommandQueue]; ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); id metal_library; @@ -354,28 +389,28 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { if (try_metallib && path_lib != nil) { // pre-compiled library found NSURL * libURL = [NSURL 
fileURLWithPath:path_lib]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); - metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; + metal_library = [device newLibraryWithURL:libURL error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } else { #if GGML_METAL_EMBED_LIBRARY - GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__); + GGML_LOG_INFO("%s: using embedded metal library\n", __func__); extern const char ggml_metallib_start[]; extern const char ggml_metallib_end[]; NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; #else - GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * path_source; NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); + GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); if (path_resource) { path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; @@ -384,15 +419,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { } if (path_source == nil) { - GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); path_source = @"ggml-metal.metal"; } - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } #endif // GGML_METAL_EMBED_LIBRARY @@ -406,9 +441,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { //[options setFastMathEnabled:false]; - metal_library = [ctx->device newLibraryWithSource:src options:options error:&error]; + metal_library = [device newLibraryWithSource:src options:options error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -416,56 +451,51 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { } // print MTL GPU family: - GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); - - const NSInteger MTLGPUFamilyMetal3 = 5001; + GGML_LOG_INFO("%s: GPU name: %s\n", __func__, [[device name] UTF8String]); // determine max supported GPU family // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf { for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; 
--i) { - if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + if ([device supportsFamily:i]) { + GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); break; } } for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) { - if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); + if ([device supportsFamily:i]) { + GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); break; } } - for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) { - if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i); + for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) { + if ([device supportsFamily:i]) { + GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i); break; } } } - ctx->support_simdgroup_reduction = [ctx->device supportsFamily:MTLGPUFamilyApple7]; - ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3]; + GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx_dev->support_simdgroup_reduction ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx_dev->support_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); - ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7]; + ctx->capture_next_compute = false; + ctx->capture_started = false; + ctx->capture_scope = nil; - GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); - - ctx->should_capture_next_compute = false; + ctx->gf = nil; + ctx->encode_async = nil; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = nil; + } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); - } -#elif TARGET_OS_OSX - if (ctx->device.maxTransferRate != 0) { - GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); - } else { - GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); } #endif @@ -478,7 +508,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { } /* - GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.threadExecutionWidth); \ */ @@ -486,17 +516,20 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { if (supported) { \ struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ id metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \ - kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \ + kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \ [metal_function release]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ [metal_library release]; \ return NULL; \ } \ } else { \ - GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ + GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } + const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; + const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; + // simd_sum and simd_max requires MTLGPUFamilyApple7 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); @@ -523,10 +556,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, 
support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); @@ -551,101 +584,101 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, 
mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); - 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, 
mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, 
mul_mv_iq2_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, 
support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, 
support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); @@ -659,14 +692,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, support_simdgroup_mm); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); 
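For reference, the GGML_METAL_ADD_KERNEL entries in this table only build a compute pipeline when the supported flag (now taken from the device context rather than the backend context) is true; otherwise the slot stays nil and a warning is logged. A minimal C sketch of that gating pattern, using hypothetical names that are not part of the patch:

#include <stdbool.h>
#include <stdio.h>

// stand-in for struct ggml_metal_kernel (illustration only, not the real type)
typedef struct { const void * pipeline; } example_kernel;

// mirrors the gating used by GGML_METAL_ADD_KERNEL: build the pipeline only when
// the device reports the required GPU-family feature, otherwise skip it with a warning
static void example_add_kernel(example_kernel * slot, const char * name, bool supported) {
    if (supported) {
        slot->pipeline = name; // the real code compiles a MTLComputePipelineState here
    } else {
        slot->pipeline = NULL;
        fprintf(stderr, "skipping %s (not supported)\n", name);
    }
}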
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); @@ -686,18 +719,20 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { } [metal_library release]; + return ctx; } static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { - GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); + GGML_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) { [ctx->kernels[i].pipeline release]; } + Block_release(ctx->encode_async); + [ctx->queue release]; - [ctx->device release]; dispatch_release(ctx->d_queue); @@ -728,7 +763,7 @@ struct ggml_backend_metal_buffer_context { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { - //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -740,28 +775,31 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs for (int i = 0; i < buf_ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { *offs = (size_t) ioffs; - //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); return buf_ctx->buffers[i].metal; } } - GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); + GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); return nil; } -static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) { +static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; } } + const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; + const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -799,7 +837,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx case GGML_OP_SOFT_MAX: case GGML_OP_RMS_NORM: case GGML_OP_GROUP_NORM: - return ctx->support_simdgroup_reduction; + return support_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ROPE: return true; @@ -825,13 +863,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx if (op->src[0]->ne[0] == 256) { return false; } - return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels + return support_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return 
ctx->support_simdgroup_reduction && + return support_simdgroup_reduction && (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32); case GGML_OP_CPY: case GGML_OP_DUP: @@ -874,875 +912,826 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx } } -static enum ggml_status ggml_metal_graph_compute( - struct ggml_backend_metal_context * ctx, - struct ggml_cgraph * gf) { +static void ggml_metal_encode_node( + ggml_backend_t backend, + int idx, + id encoder) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - @autoreleasepool { - MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - edesc.dispatchType = MTLDispatchTypeSerial; + struct ggml_cgraph * gf = ctx->gf; - // create multiple command buffers and enqueue them - // then, we encode the graph into the command buffers in parallel + struct ggml_tensor * node = ggml_graph_node(gf, idx); - const int n_nodes = gf->n_nodes; - const int n_cb = ctx->n_cb; - const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; + //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op)); - const bool should_capture = ctx->should_capture_next_compute; - if (should_capture) { - ctx->should_capture_next_compute = false; + struct ggml_tensor * src0 = node->src[0]; + struct ggml_tensor * src1 = node->src[1]; + struct ggml_tensor * src2 = node->src[2]; + struct ggml_tensor * dst = node; - MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; - descriptor.captureObject = ctx->queue; - - NSError * error = nil; - if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { - GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ABORT("capture failed"); - } + if (ggml_is_empty(dst)) { + return; } - id command_buffer_builder[n_cb]; - for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - command_buffer_builder[cb_idx] = command_buffer; - - // always enqueue the first two command buffers - // enqueue all of the command buffers if we don't need to abort - if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; - } + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop -> next node + } return; + default: + { + } break; } - const id *command_buffers = command_buffer_builder; + if (!ggml_metal_supports_op(ctx_dev, dst)) { + GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_ABORT("unsupported op"); + } - dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) { - const int cb_idx = iter; + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_src2 = 0; - size_t offs_dst = 0; + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; - id command_buffer = command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; + const int64_t ne10 = src1 ? 
src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; - const int node_start = (cb_idx + 0) * n_nodes_per_cb; - const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; - for (int i = node_start; i < node_end; ++i) { - if (i == -1) { - [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; - continue; - } + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22); + const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; - struct ggml_tensor * src0 = gf->nodes[i]->src[0]; - struct ggml_tensor * src1 = gf->nodes[i]->src[1]; - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; - struct ggml_tensor * dst = gf->nodes[i]; + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; - if (ggml_is_empty(dst)) { - continue; - } + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; - switch (dst->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop -> next node - } continue; + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_src2 = 0; + size_t offs_dst = 0; + + id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; + id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; + id id_dst = dst ? 
ggml_metal_get_buffer(dst, &offs_dst) : nil; + + //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + id device = ctx_dev->mtl_device; + + switch (dst->op) { + case GGML_OP_CONCAT: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + + const int32_t dim = ((const int32_t *) dst->op_params)[0]; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:27]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + const size_t offs = 0; + + bool bcast_row = false; + + int64_t nb = ne00; // used by the "row" kernels + + id pipeline = nil; + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // src1 is a row + GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; + case GGML_OP_DIV: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + bcast_row = true; + } else { + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; + default: GGML_ABORT("fatal error"); + } + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + } break; + case GGML_OP_REPEAT: + { + id pipeline; + + switch (src0t) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; + case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + 
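For reference, the ADD/SUB/MUL/DIV case above takes a row-broadcast fast path when src1 is a single contiguous row and both ne00 and ne10 are multiples of 4: the *_ROW kernels then process packs of 4 floats, which is why nb becomes ne00/4 and the dispatch count ggml_nelements(dst)/4. A minimal C sketch of that selection, with hypothetical names not taken from the patch:

#include <stdbool.h>
#include <stdint.h>

// mirrors the fast-path test used for the binary ops above
static bool use_row_kernel(int64_t src1_nelements, bool src1_is_contiguous, int64_t ne00, int64_t ne10) {
    return src1_nelements == ne10 && src1_is_contiguous && ne00 % 4 == 0 && ne10 % 4 == 0;
}

// on the row path, one threadgroup is dispatched per 4-element pack of dst
static int64_t row_dispatch_count(int64_t dst_nelements) {
    return dst_nelements / 4;
}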
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ACC: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + GGML_ASSERT(dstt == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const size_t pnb1 = ((const int32_t *) dst->op_params)[0]; + const size_t pnb2 = ((const int32_t *) dst->op_params)[1]; + const size_t pnb3 = ((const int32_t *) dst->op_params)[2]; + const size_t offs = ((const int32_t *) dst->op_params)[3]; + + const bool inplace = (bool) ((const int32_t *) dst->op_params)[4]; + + if (!inplace) { + // run a separete kernel to cpy src->dst + // not sure how to avoid this + // TODO: make a simpler cpy_bytes kernel + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 
length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + float scale; + memcpy(&scale, dst->op_params, sizeof(scale)); + + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + n /= 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_CLAMP: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; + + float min; + float max; + memcpy(&min, ((const int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&max, ((const int32_t *) dst->op_params) + 1, sizeof(float)); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&min length:sizeof(min) atIndex:2]; + [encoder setBytes:&max length:sizeof(max) atIndex:3]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + // we are not taking into account the strides, so for now require contiguous tensors + GGML_ASSERT(ggml_is_contiguous(src0)); + + case GGML_UNARY_OP_TANH: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_RELU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; + + [encoder setComputePipelineState:pipeline]; + 
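For reference, the GGML_OP_ACC case above encodes two passes when the op is not in-place: a CPY_F32_F32 kernel first copies src0 into dst, then the ADD kernel accumulates src1 into the strided view of dst described by the offs/pnb1/pnb2/pnb3 op parameters. A minimal CPU-side sketch of the assumed semantics (2-D case only; the helper name is hypothetical and not taken from the patch):

#include <stdint.h>
#include <string.h>

// dst starts as a copy of src0, then src1 is added into the view of dst selected by
// a byte offset and a per-row byte stride (pnb1), matching the two GPU passes above
static void acc_f32_ref(float * dst, const float * src0, const float * src1,
                        int64_t ne0, int64_t ne1,    // dst/src0 extents
                        int64_t ne10, int64_t ne11,  // src1 extents
                        size_t offs, size_t pnb1) {  // view offset and row stride, in bytes
    memcpy(dst, src0, (size_t)(ne0*ne1)*sizeof(float)); // the extra copy pass
    char * base = (char *) dst + offs;
    for (int64_t i1 = 0; i1 < ne11; ++i1) {
        float * row = (float *)(base + (size_t) i1*pnb1);
        for (int64_t i0 = 0; i0 < ne10; ++i0) {
            row[i0] += src1[i1*ne10 + i0];
        }
    }
}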
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SIGMOID: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SILU: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: - { - } break; - } - - if (!ggml_metal_supports_op(ctx, dst)) { - GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ABORT("unsupported op"); - } - - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; - } - - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; - - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? src0->nb[3] : 0; - - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; - - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; - - const int64_t ne20 = src2 ? src2->ne[0] : 0; - const int64_t ne21 = src2 ? src2->ne[1] : 0; - const int64_t ne22 = src2 ? 
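GELU, GELU_QUICK and SILU above (and SCALE earlier) share one vectorization pattern: when the element count is divisible by 4, the *_4 pipeline variant is selected and the dispatch count is divided by 4, so each thread handles a float4 instead of a single float. A small sketch of that host-side selection; the pipeline_t type and helper name are hypothetical stand-ins for ctx->kernels[...].pipeline:

    #include <stdint.h>

    // Hypothetical handle standing in for ctx->kernels[...].pipeline.
    typedef struct { const char * name; } pipeline_t;

    // Pick the 4-wide kernel variant when the element count allows it and
    // return the number of threadgroups for the MTLSizeMake(n, 1, 1) dispatch.
    static int64_t select_unary_pipeline(int64_t n_elements,
                                         const pipeline_t * scalar_pipe,  // e.g. ..._GELU
                                         const pipeline_t * vec4_pipe,    // e.g. ..._GELU_4
                                         const pipeline_t ** out_pipe) {
        if (n_elements % 4 == 0) {
            *out_pipe = vec4_pipe;  // each thread processes a float4
            return n_elements/4;
        }
        *out_pipe = scalar_pipe;    // one element per thread
        return n_elements;
    }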
src2->ne[2] : 0; GGML_UNUSED(ne22); - const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - - const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); - const uint64_t nb21 = src2 ? src2->nb[1] : 0; - const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; - - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; - - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; - id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; - id id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil; - - //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} - - switch (dst->op) { - case GGML_OP_CONCAT: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; - - const int32_t dim = ((int32_t *) dst->op_params)[0]; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder setBytes:&nb3 length:sizeof(nb3) 
atIndex:26]; - [encoder setBytes:&dim length:sizeof(dim) atIndex:27]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - - const size_t offs = 0; - - bool bcast_row = false; - - int64_t nb = ne00; // used by the "row" kernels - - id pipeline = nil; - - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(src0)); - - // src1 is a row - GGML_ASSERT(ne11 == 1); - - nb = ne00 / 4; - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - bcast_row = true; - } else { - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ABORT("fatal error"); - } - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; - - if (bcast_row) { - const int64_t n = ggml_nelements(dst)/4; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } else { - const int nth = MIN((int) 
pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } - } break; - case GGML_OP_REPEAT: - { - id pipeline; - - switch (src0t) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; - case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; - case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ACC: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - GGML_ASSERT(dstt == GGML_TYPE_F32); - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - const size_t pnb1 = ((int32_t *) dst->op_params)[0]; - const size_t pnb2 = ((int32_t *) dst->op_params)[1]; - const size_t pnb3 = ((int32_t *) dst->op_params)[2]; - const size_t offs = ((int32_t *) dst->op_params)[3]; - - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - // run a separete kernel to cpy src->dst - // not sure how to avoid this - // TODO: make a simpler cpy_bytes kernel - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 
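For ADD/SUB/MUL/DIV, the *_ROW kernels are preferred when src1 is a single contiguous row broadcast across src0 and the row length is a multiple of 4; nb is then ne00/4 and ggml_nelements(dst)/4 threadgroups are dispatched. A sketch of that decision, mirroring the checks in the hunk above (helper names are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    // The "row" kernels require src1 to be exactly one contiguous row (its total
    // element count equals ne10) and both row lengths to be multiples of 4, so
    // the kernels can operate on float4 chunks.
    static bool use_row_kernel(int64_t src1_nelements, bool src1_is_contiguous,
                               int64_t ne00, int64_t ne10) {
        return src1_nelements == ne10 && src1_is_contiguous &&
               ne00 % 4 == 0 && ne10 % 4 == 0;
    }

    // Value passed to the kernel as `nb`: float4 chunks per src0 row.
    static int64_t row_chunks(int64_t ne00) {
        return ne00/4;
    }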
length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_SCALE: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - float scale; - memcpy(&scale, dst->op_params, sizeof(scale)); - - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - n /= 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_CLAMP: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; - - float min; - float max; - memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); - memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - 
[encoder setBytes:&min length:sizeof(min) atIndex:2]; - [encoder setBytes:&max length:sizeof(max) atIndex:3]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(gf->nodes[i])) { - // we are not taking into account the strides, so for now require contiguous tensors - GGML_ASSERT(ggml_is_contiguous(src0)); - - case GGML_UNARY_OP_TANH: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_RELU: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_SIGMOID: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU_QUICK: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_SILU: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); - } - } break; - case GGML_OP_SQR: - { - 
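CLAMP here (like SCALE and SOFT_MAX) reads float parameters out of dst->op_params, which is an int32_t array: the values are bit-copied out with memcpy rather than cast through a float pointer, avoiding strict-aliasing issues. A minimal sketch of the pattern (helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    // op_params is an int32_t array; float parameters are stored by bit pattern,
    // so they are read back with memcpy instead of a pointer cast.
    static void read_clamp_params(const int32_t * op_params, float * min, float * max) {
        memcpy(min, op_params + 0, sizeof(float));
        memcpy(max, op_params + 1, sizeof(float));
    }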
GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SQRT: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SIN: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_COS: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SUM_ROWS: - { - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) 
threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SOFT_MAX: - { - GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); - - int nth = 32; // SIMD width - - id pipeline = nil; - - const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); - - if (ne00%4 == 0) { - while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) { - nth *= 2; - } - if (use_f16) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; - } - } else { - while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { - nth *= 2; - } - if (use_f16) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline; - } - } - - float scale; - float max_bias; - - memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); - memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); - - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - const uint32_t n_head = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; - [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:8]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:9]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_DIAG_MASK_INF: - { - const int n_past = ((int32_t *)(dst->op_params))[0]; - - id pipeline = nil; - - if (ne00%8 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - - if (ne00%8 == 0) { - [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - else { - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - } break; - case GGML_OP_SSM_CONV: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder 
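DIAG_MASK_INF implements the causal mask: in every ne00-wide row i, columns past n_past + i are set to -INF, and when ne00 is a multiple of 8 the kernel handles 8 elements per invocation, hence the ne00*ne01*ne02/8 dispatch. A CPU sketch of the masking semantics as I read them from ggml (a reference only, not the Metal kernel):

    #include <math.h>
    #include <stdint.h>

    // Reference semantics: in every ne00-wide row i1, positions with index
    // greater than n_past + i1 are replaced with -INFINITY; the pattern
    // repeats over the ne02 "batch" dimension.
    static void diag_mask_inf_ref(float * x, int64_t ne00, int64_t ne01, int64_t ne02, int n_past) {
        for (int64_t i2 = 0; i2 < ne02; ++i2) {
            for (int64_t i1 = 0; i1 < ne01; ++i1) {
                float * row = x + (i2*ne01 + i1)*ne00;
                for (int64_t i0 = n_past + i1 + 1; i0 < ne00; ++i0) {
                    row[i0] = -INFINITY;
                }
            }
        }
    }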
setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SSM_SCAN: - { - struct ggml_tensor * src3 = gf->nodes[i]->src[3]; - struct ggml_tensor * src4 = gf->nodes[i]->src[4]; - struct ggml_tensor * src5 = gf->nodes[i]->src[5]; - - GGML_ASSERT(src3); - GGML_ASSERT(src4); - GGML_ASSERT(src5); - - size_t offs_src3 = 0; - size_t offs_src4 = 0; - size_t offs_src5 = 0; - - id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; - id id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil; - id id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil; - - const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30); - const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31); - - const uint64_t nb30 = src3->nb[0]; - const uint64_t nb31 = src3->nb[1]; - - const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40); - const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41); - const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42); - - const uint64_t nb40 = src4->nb[0]; - const uint64_t nb41 = src4->nb[1]; - const uint64_t nb42 = src4->nb[2]; - - const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50); - const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51); - const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52); - - const uint64_t nb50 = src5->nb[0]; - const uint64_t nb51 = src5->nb[1]; - const uint64_t nb52 = src5->nb[2]; - - const int64_t d_state = ne00; - const int64_t d_inner = ne01; - const int64_t n_seq_tokens = ne11; - const int64_t n_seqs = ne02; - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; - [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; - [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; - - [encoder setBytes:&d_state length:sizeof(d_state) atIndex:7]; - [encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8]; - [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9]; - [encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10]; - - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12]; - [encoder 
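SSM_SCAN launches one single-thread threadgroup per (inner channel, sequence) pair: d_state/d_inner come from src0->ne[0..1], n_seq_tokens from src1->ne[1] and n_seqs from src0->ne[2]. A small sketch of that shape-to-dispatch mapping (struct and helper names are illustrative):

    #include <stdint.h>

    // How the scan dimensions behind the MTLSizeMake(d_inner, n_seqs, 1)
    // dispatch are read off the tensor shapes, mirroring the assignments above.
    struct ssm_scan_dims {
        int64_t d_state;       // src0->ne[0]
        int64_t d_inner;       // src0->ne[1]
        int64_t n_seq_tokens;  // src1->ne[1]
        int64_t n_seqs;        // src0->ne[2]
    };

    static struct ssm_scan_dims ssm_scan_dims_from_shapes(const int64_t src0_ne[4],
                                                          const int64_t src1_ne[4]) {
        struct ssm_scan_dims d;
        d.d_state      = src0_ne[0];
        d.d_inner      = src0_ne[1];
        d.n_seq_tokens = src1_ne[1];
        d.n_seqs       = src0_ne[2];
        return d;
    }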
setBytes:&nb02 length:sizeof(nb02) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; - [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20]; - [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21]; - [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22]; - [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23]; - [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24]; - [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25]; - [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26]; - [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27]; - [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28]; - - [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_MUL_MAT: - { - GGML_ASSERT(ne00 == ne10); - - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); - - const uint r2 = ne12/ne02; - const uint r3 = ne13/ne03; - - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - int ne11_mm_min = 1; + { + GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); + GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_SQR: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SQRT: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SIN: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_COS: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SUM_ROWS: + { + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + + [encoder setComputePipelineState:pipeline]; + 
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); + + int nth = 32; // SIMD width + + id pipeline = nil; + + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); + + if (ne00%4 == 0) { + while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) { + nth *= 2; + } + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; + } + } else { + while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { + nth *= 2; + } + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline; + } + } + + float scale; + float max_bias; + + memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias)); + + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + + const uint32_t n_head = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (id_src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7]; + [encoder setBytes:&m0 
length:sizeof(m0) atIndex:8]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:9]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((const int32_t *)(dst->op_params))[0]; + + id pipeline = nil; + + if (ne00%8 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + } break; + case GGML_OP_SSM_CONV: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SSM_SCAN: + { + struct ggml_tensor * src3 = node->src[3]; + struct ggml_tensor * src4 = node->src[4]; + struct ggml_tensor * src5 = node->src[5]; + + GGML_ASSERT(src3); + GGML_ASSERT(src4); + GGML_ASSERT(src5); + + size_t offs_src3 = 0; + size_t offs_src4 = 0; + size_t offs_src5 = 0; + + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; + id id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil; + id id_src5 = src5 ? 
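SOFT_MAX precomputes the two ALiBi slope bases on the host: n_head_log2 is the largest power of two not exceeding n_head, and m0/m1 are 2^(-max_bias/n_head_log2) and 2^(-(max_bias/2)/n_head_log2); the per-head exponentiation happens inside the shader and is not shown here. A sketch of the host-side part, matching the lines above (helper name is illustrative):

    #include <math.h>
    #include <stdint.h>

    // ALiBi parameters as computed on the host: n_head is derived from the row
    // counts, n_head_log2 is the largest power of two <= n_head, and m0/m1 are
    // the two slope bases raised to per-head powers inside the kernel.
    static void softmax_alibi_bases(int64_t nrows_x, int64_t nrows_y, float max_bias,
                                    uint32_t * n_head_log2, float * m0, float * m1) {
        const uint32_t n_head = (uint32_t)(nrows_x/nrows_y);

        *n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

        *m0 = powf(2.0f, -(max_bias       ) / *n_head_log2);
        *m1 = powf(2.0f, -(max_bias / 2.0f) / *n_head_log2);
    }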
ggml_metal_get_buffer(src5, &offs_src5) : nil; + + const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30); + const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31); + + const uint64_t nb30 = src3->nb[0]; + const uint64_t nb31 = src3->nb[1]; + + const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40); + const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41); + const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42); + + const uint64_t nb40 = src4->nb[0]; + const uint64_t nb41 = src4->nb[1]; + const uint64_t nb42 = src4->nb[2]; + + const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50); + const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51); + const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52); + + const uint64_t nb50 = src5->nb[0]; + const uint64_t nb51 = src5->nb[1]; + const uint64_t nb52 = src5->nb[2]; + + const int64_t d_state = ne00; + const int64_t d_inner = ne01; + const int64_t n_seq_tokens = ne11; + const int64_t n_seqs = ne02; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; + [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; + + [encoder setBytes:&d_state length:sizeof(d_state) atIndex:7]; + [encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8]; + [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9]; + [encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10]; + + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20]; + [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21]; + [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22]; + [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23]; + [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24]; + [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25]; + [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26]; + [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27]; + [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28]; + + [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL_MAT: + { + GGML_ASSERT(ne00 == ne10); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; #if 0 - // the numbers below are measured on M2 Ultra for 7B and 13B models - // these numbers do not translate to other devices or model sizes - // TODO: need to find a better approach - if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { + // the numbers below are measured on M2 Ultra for 7B 
and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach + if ([device.name isEqualToString:@"Apple M2 Ultra"]) { switch (src0t) { case GGML_TYPE_F16: ne11_mm_min = 2; break; case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; @@ -1762,12 +1751,12 @@ static enum ggml_status ggml_metal_graph_compute( // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - !ggml_is_transposed(src0) && - !ggml_is_transposed(src1) && - src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + if ([device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && ne00 >= 64 && + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); // some Metal matrix data types require aligned pointers @@ -1974,7 +1963,7 @@ static enum ggml_status ggml_metal_graph_compute( } break; default: { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_ABORT("not implemented"); } }; @@ -2001,8 +1990,8 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -2036,1041 +2025,1120 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } - } break; - case GGML_OP_MUL_MAT_ID: - { - const int n_as = src0->ne[2]; - - // src2 = ids - const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); - - GGML_ASSERT(src2t == GGML_TYPE_I32); - - GGML_ASSERT(!ggml_is_transposed(src0)); - GGML_ASSERT(!ggml_is_transposed(src1)); - - GGML_ASSERT(src1t == GGML_TYPE_F32); - - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - // ne20 = n_used_experts - // ne21 = n_rows - const int dst_rows = ne20*ne21; - const int dst_rows_min = n_as; - const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4; - - // max size of the rowids array in the kernel shared buffer - GGML_ASSERT(dst_rows <= dst_rows_max); - - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - // !!! - // TODO: for now, always use mat-vec kernels until we figure out how to improve the - // indirect matrix multiplication - // !!! 
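In the MUL_MAT_ID matrix-matrix path, dst_rows = ne20*ne21 (expert, row) ids are kept in threadgroup memory as 4-byte entries after a fixed 8192-byte tile area, which is where both dst_rows_max and the later setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4, 16) come from. A sketch of that budget check (helper name and the 16-byte rounding macro are illustrative; max_tg_mem stands for the device's maxThreadgroupMemoryLength):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stddef.h>

    // Round up to a multiple of 16, like GGML_PAD(x, 16).
    #define PAD16(x) (((x) + (size_t)15) & ~(size_t)15)

    // dst_rows (expert, row) ids are stored as 4-byte entries after a fixed
    // 8192-byte matrix tile, with 32 bytes of slack, mirroring dst_rows_max
    // and the threadgroup memory length set later in this hunk.
    static bool mul_mat_id_rowids_fit(int64_t ne20, int64_t ne21,
                                      size_t max_tg_mem,        // device limit (>= 16 KiB in practice)
                                      size_t * tg_mem_needed) {
        const int64_t dst_rows     = ne20*ne21;
        const int64_t dst_rows_max = (int64_t)((max_tg_mem - 32 - 8192)/4);

        *tg_mem_needed = PAD16(8192 + (size_t) dst_rows*4 /*sizeof(ushort2)*/);

        return dst_rows <= dst_rows_max;
    }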
- if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne00 % 32 == 0 && ne00 >= 64 && - dst_rows > dst_rows_min) { - - // some Metal matrix data types require aligned pointers - // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; - default: break; - } - - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL_MAT_ID not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; - [encoder setBytes:&ne11 
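The "aligned pointers" comment refers to Metal's simdgroup matrix loads: the asserts require the src0 row stride nb01 to be a multiple of 16 bytes for F32 and 8 bytes for F16 (per the Metal Shading Language spec referenced above). A tiny sketch of that precondition (helper name is illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    // Simdgroup matrix loads need an aligned row stride: 16 bytes for F32 tiles,
    // 8 bytes for F16 tiles; quantized types have no such requirement here.
    static bool mm_row_stride_ok(bool src0_is_f32, bool src0_is_f16, uint64_t nb01) {
        if (src0_is_f32) return nb01 % 16 == 0;
        if (src0_is_f16) return nb01 %  8 == 0;
        return true;
    }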
length:sizeof(ne11) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; - - [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - id pipeline = nil; - - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - nth0 = 32; - nth1 = 1; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; - } break; - case GGML_TYPE_IQ2_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_S: - { - nth0 = 4; - nth1 = 16; - pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_M: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; - } break; - case GGML_TYPE_IQ4_NL: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; - } break; - case GGML_TYPE_IQ4_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; - } break; - default: - { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ABORT("not implemented"); - } - }; - - if (ggml_is_quantized(src0t)) { - GGML_ASSERT(ne00 >= nth0*nth1); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22]; - - const int64_t _ne1 = 1; - const int tgz = dst_rows; - - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { - const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { - const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 
256*4+128 : 512*4; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { - const int mem_size = 32*sizeof(float); - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q3_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1 - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } - } break; - case GGML_OP_GET_ROWS: - { - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; - case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ABORT("not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ggml_is_contiguous_1(src0)); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - int nth = 32; // SIMD width - - while (nth < ne00/4 && nth < 1024) { - nth *= 2; - } - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_GROUP_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ggml_is_contiguous(src0)); - - float eps; - memcpy(&eps, dst->op_params + 1, sizeof(float)); - - const int32_t n_groups = ((int32_t *) dst->op_params)[0]; - - int nth = 32; // SIMD width - - //while (nth < ne00/4 && nth < 1024) { - // nth *= 2; - //} - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&eps length:sizeof( float) atIndex:9]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_NORM: - { - GGML_ASSERT(ggml_is_contiguous_1(src0)); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - const int nth = MIN(256, ne00); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder 
setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; - - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(ne10 == ne02); - - const int nth = MIN(1024, ne00); - - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - - id pipeline = nil; - - if (!is_neox) { - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - } else { - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - if (id_src2 != nil) { - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; - [encoder 
setBytes:&n_ctx_orig length:sizeof( int) atIndex:22]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_IM2COL: - { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int32_t N = src1->ne[is_2D ? 3 : 2]; - const int32_t IC = src1->ne[is_2D ? 2 : 1]; - const int32_t IH = is_2D ? src1->ne[1] : 1; - const int32_t IW = src1->ne[0]; - - const int32_t KH = is_2D ? src0->ne[1] : 1; - const int32_t KW = src0->ne[0]; - - const int32_t OH = is_2D ? dst->ne[2] : 1; - const int32_t OW = dst->ne[1]; - - const int32_t CHW = IC * KH * KW; - - const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; - const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; - - id pipeline = nil; - - switch (dst->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; - [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; - [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; - [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; - [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; - [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; - [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; - [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; - [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; - [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; - - [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; - } break; - case GGML_OP_UPSCALE: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const float sf0 = (float)ne0/src0->ne[0]; - const float sf1 = (float)ne1/src0->ne[1]; - const float sf2 = (float)ne2/src0->ne[2]; - const float sf3 = (float)ne3/src0->ne[3]; - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 
length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18]; - [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19]; - [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20]; - [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_PAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: - { - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - float start; - float step; - - memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float)); - memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:0]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1]; - [encoder setBytes:&start length:sizeof(start) atIndex:2]; - [encoder setBytes:&step length:sizeof(step) atIndex:3]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_TIMESTEP_EMBEDDING: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const int dim = dst->op_params[0]; - const int max_period = dst->op_params[1]; - - const int half = dim / 2; - - id pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2]; - [encoder setBytes:&dim length:sizeof(dim) atIndex:3]; - [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4]; - - const int nth = MIN(1024, half); - - [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARGSORT: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - const int nrows = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - // bitonic sort requires the number of elements to be power of 2 - int64_t ne00_padded = 1; - while (ne00_padded < ne00) { - ne00_padded *= 2; - } - - // Metal kernels require the buffer size to be multiple of 16 bytes - // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength - const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16); - - id pipeline = nil; - - switch (order) { - case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3]; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)]; - } break; - case GGML_OP_LEAKY_RELU: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - float slope; - memcpy(&slope, dst->op_params, sizeof(float)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ne11 % 32 == 0); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - GGML_ASSERT(ggml_are_same_shape (src1, src2)); - - struct ggml_tensor * src3 = gf->nodes[i]->src[3]; - - size_t offs_src3 = 0; - - id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; - - GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16); - GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && - "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); - - const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); - //const int64_t ne31 = src3 ? src3->ne[1] : 0; - const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); - const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); - - const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30); - const uint64_t nb31 = src3 ? src3->nb[1] : 0; - const uint64_t nb32 = src3 ? 
src3->nb[2] : 0; GGML_UNUSED(nb32); - const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33); - - const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); - - float scale; - float max_bias; - float logit_softcap; - memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); - memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); - memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap)); - - if (logit_softcap != 0.0f) { - scale /= logit_softcap; - } - - const uint32_t n_head = src0->ne[2]; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - id pipeline = nil; - - bool use_vec_kernel = false; - - if (ne01 >= 4 || (ne00%128 != 0)) { - switch (ne00) { - case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; - case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; - case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; - case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; - default: - { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } - } - } else { - use_vec_kernel = true; - - switch (ne00) { - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; - default: - { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } - } - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - if (id_src3) { - [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb23 length:sizeof(uint64_t) 
atIndex:19]; - [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; - [encoder setBytes:&scale length:sizeof( float) atIndex:23]; - [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; - [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28]; - - if (!use_vec_kernel) { - // half8x8 kernel - const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! - const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! - - GGML_ASSERT(nqptg <= 32); - GGML_ASSERT(nqptg % 8 == 0); - GGML_ASSERT(ncpsg % 32 == 0); - - int64_t nsgmax = 2; - - while (true) { - const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); - if (smem > ctx->device.maxThreadgroupMemoryLength) { - break; - } - nsgmax *= 2; - } - nsgmax /= 2; - - // simdgroups per threadgroup (a.k.a. warps) - const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; - - const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); - - //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); - GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); - - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; - } else { - // half1x4 kernel - const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! - const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! - - GGML_ASSERT(nqptg <= 32); - GGML_ASSERT(nqptg % 1 == 0); - GGML_ASSERT(ncpsg % 32 == 0); - - // simdgroups per threadgroup (a.k.a. 
warps) - const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); - - int64_t nsg = 1; - while (nsg <= nsgt) { - nsg *= 2; - } - nsg /= 2; - - const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); - - //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); - GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; - } - } break; - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - { - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); - - int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); - - switch (dstt) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; - default: GGML_ABORT("not implemented"); - }; - } break; - case GGML_TYPE_F16: - { - switch (dstt) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - default: GGML_ABORT("not implemented"); - }; - } break; - default: GGML_ABORT("not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, 
ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); + } break; + case GGML_OP_MUL_MAT_ID: + { + const int n_as = src0->ne[2]; + + // src2 = ids + const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); + + GGML_ASSERT(src2t == GGML_TYPE_I32); + + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src1t == GGML_TYPE_F32); + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + // ne20 = n_used_experts + // ne21 = n_rows + const int dst_rows = ne20*ne21; + const int dst_rows_min = n_as; + const int dst_rows_max = (device.maxThreadgroupMemoryLength - 32 - 8192)/4; + + // max size of the rowids array in the kernel shared buffer + GGML_ASSERT(dst_rows <= dst_rows_max); + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + // !!! + // TODO: for now, always use mat-vec kernels until we figure out how to improve the + // indirect matrix multiplication + // !!! + if ([device supportsFamily:MTLGPUFamilyApple7] && + ne00 % 32 == 0 && ne00 >= 64 && + dst_rows > dst_rows_min) { + + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; } - } - if (should_capture) { - [encoder popDebugGroup]; + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; + default: GGML_ABORT("MUL_MAT_ID not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; + + [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + id pipeline = nil; + + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + 
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; + } break; + case GGML_TYPE_IQ2_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; + } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + } break; + case GGML_TYPE_IQ4_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; + } break; + default: + { + GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t); + GGML_ABORT("not implemented"); + } + }; + + if (ggml_is_quantized(src0t)) { + GGML_ASSERT(ne00 >= nth0*nth1); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22]; + + const int64_t _ne1 = 1; + const int tgz = dst_rows; + + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == 
GGML_TYPE_IQ2_S) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1 + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; 
break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; + default: GGML_ABORT("not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < 1024) { + nth *= 2; + } + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ggml_is_contiguous(src0)); + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + + const int32_t n_groups = ((const int32_t *) dst->op_params)[0]; + + int nth = 32; // SIMD width + + //while (nth < ne00/4 && nth < 1024) { + // nth *= 2; + //} + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&eps 
length:sizeof( float) atIndex:9]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_NORM: + { + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = MIN(256, ne00); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + + const int n_past = ((const int32_t *) dst->op_params)[0]; + const int n_dims = ((const int32_t *) dst->op_params)[1]; + const int mode = ((const int32_t *) dst->op_params)[2]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (const int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (const int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (const int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (const int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float)); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + id pipeline = nil; + + if (!is_neox) { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + if (id_src2 != nil) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; + [encoder 
setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_ctx_orig length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_IM2COL: + { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + + id pipeline = nil; + + switch (dst->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const float sf0 = (float)ne0/src0->ne[0]; + const float sf1 = (float)ne1/src0->ne[1]; + const float sf2 = (float)ne2/src0->ne[2]; + const float sf3 = (float)ne3/src0->ne[3]; + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18]; + [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19]; + [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20]; + [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) 
atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ARANGE: + { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float step; + + memcpy(&start, ((const int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&step, ((const int32_t *) dst->op_params) + 2, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:0]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1]; + [encoder setBytes:&start length:sizeof(start) atIndex:2]; + [encoder setBytes:&step length:sizeof(step) atIndex:3]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + const int half = dim / 2; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:3]; + [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4]; + + const int nth = MIN(1024, half); + + [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ARGSORT: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + // bitonic sort requires the number of elements to be power of 2 + int64_t ne00_padded = 1; + while (ne00_padded < ne00) { + ne00_padded *= 2; + } + + // Metal kernels require the buffer size to be multiple of 16 bytes + // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength + const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16); + + id pipeline = nil; + + switch (order) { + case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; 
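
The ARGSORT case rounds each row up to a power of two for the bitonic network and pads the threadgroup allocation to a 16-byte multiple. The two roundings in isolation (the pad helper here only stands in for GGML_PAD):

// Sketch of the two paddings used by the ARGSORT case: next power of two for
// the bitonic sort, and a 16-byte multiple for setThreadgroupMemoryLength.
// PAD_TO is a local stand-in with the same effect as GGML_PAD for this use.
#include <stdint.h>
#include <stdio.h>

#define PAD_TO(x, n) ((((x) + (n) - 1) / (n)) * (n))

static int64_t next_pow2(int64_t n) {
    int64_t p = 1;
    while (p < n) p *= 2;   // same doubling loop as the Metal code
    return p;
}

int main(void) {
    const int64_t ne00        = 1000;              // row length to sort
    const int64_t ne00_padded = next_pow2(ne00);   // 1024
    const size_t  mem_size    = PAD_TO(ne00_padded * sizeof(int32_t), 16);

    printf("padded = %lld, threadgroup mem = %zu bytes\n",
           (long long) ne00_padded, mem_size);
    return 0;
}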
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3]; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)]; + } break; + case GGML_OP_LEAKY_RELU: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + float slope; + memcpy(&slope, dst->op_params, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ne11 % 32 == 0); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_are_same_shape (src1, src2)); + + struct ggml_tensor * src3 = node->src[3]; + + size_t offs_src3 = 0; + + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; + + GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16); + GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && + "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); + + const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); + //const int64_t ne31 = src3 ? src3->ne[1] : 0; + const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); + const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); + + const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30); + const uint64_t nb31 = src3 ? src3->nb[1] : 0; + const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32); + const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33); + + const enum ggml_type src2t = src2 ? 
src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); + + float scale; + float max_bias; + float logit_softcap; + memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias)); + memcpy(&logit_softcap, ((const int32_t *) dst->op_params) + 2, sizeof(logit_softcap)); + + if (logit_softcap != 0.0f) { + scale /= logit_softcap; + } + + const uint32_t n_head = src0->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + id pipeline = nil; + + bool use_vec_kernel = false; + + if (ne01 >= 4 || (ne00%128 != 0)) { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } else { + use_vec_kernel = true; + + switch (ne00) { + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + if (id_src3) { + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21]; + [encoder 
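
m0, m1 and n_head_log2 above are the ALiBi slope bases. The per-head slope derived from them is shown below as it is commonly implemented in ggml kernels; treat the per-head formula as an assumption, since the kernel body is not part of this hunk.

// ALiBi slope bases as computed on the host side in the patch, plus the usual
// per-head slope derivation (assumed, not shown in this hunk). Build with -lm.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t n_head   = 12;
    const float    max_bias = 8.0f;

    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    for (uint32_t h = 0; h < n_head; ++h) {
        const float slope = h < n_head_log2
            ? powf(m0, (float) (h + 1))
            : powf(m1, (float) (2*(h - n_head_log2) + 1));
        printf("head %2u: slope = %.6f\n", (unsigned) h, slope);
    }
    return 0;
}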
setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; + [encoder setBytes:&scale length:sizeof( float) atIndex:23]; + [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; + [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28]; + + if (!use_vec_kernel) { + // half8x8 kernel + const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 8 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + int64_t nsgmax = 2; + + while (true) { + const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); + if (smem > device.maxThreadgroupMemoryLength) { + break; + } + nsgmax *= 2; + } + nsgmax /= 2; + + // simdgroups per threadgroup (a.k.a. warps) + const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; + + const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); + + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } else { + // half1x4 kernel + const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 1 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + // simdgroups per threadgroup (a.k.a. 
warps) + const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); + + int64_t nsg = 1; + while (nsg <= nsgt) { + nsg *= 2; + } + nsg /= 2; + + const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); + + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; + default: GGML_ABORT("not implemented"); + }; + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; + default: GGML_ABORT("not implemented"); + }; + } break; + default: GGML_ABORT("not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + { + GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, 
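
A standalone version of the simdgroup-count and shared-memory sizing used by the half8x8 FLASH_ATTN_EXT path above; the 32 KiB threadgroup-memory limit and the tensor sizes are made-up example values, while the smem formula mirrors the patch (half precision, hence sizeof(float)/2):

// Sizing sketch for the FLASH_ATTN_EXT half8x8 path: grow nsgmax until the
// shared-memory estimate exceeds the (assumed) device limit, back off once,
// then clamp the simdgroup count the same way the patch does.
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    const int64_t ne00 = 128;    // head size
    const int64_t ne01 = 32;     // number of queries
    const int64_t ne11 = 4096;   // number of cached keys/values
    const int64_t max_threads_per_tg = 1024;   // assumed pipeline limit
    const size_t  max_tg_mem = 32 * 1024;      // assumed device limit

    const int64_t nqptg = 8;     // queries per threadgroup
    const int64_t ncpsg = 32;    // cache values per simdgroup

    int64_t nsgmax = 2;
    while (1) {
        const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
        if (smem > max_tg_mem) break;
        nsgmax *= 2;
    }
    nsgmax /= 2;

    const int64_t nsg = ne01 <= nqptg
        ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, max_threads_per_tg/32)))
        : 4;

    const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
    printf("nsg = %lld, smem = %zu bytes\n", (long long) nsg, smem);
    return 0;
}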
ggml_op_name(dst->op)); + GGML_ABORT("fatal error"); + } + } +} + +static enum ggml_status ggml_metal_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * gf) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + // number of nodes encoded by the main thread (empirically determined) + const int n_main = 128; + + // number of threads in addition to the main thread + const int n_cb = ctx->n_cb; + + // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them + // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread + // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes + // each thread creates it's own command buffer and enqueues the ops in parallel + // + // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 + + @autoreleasepool { + ctx->gf = gf; + + ctx->n_nodes_0 = MIN(n_main, gf->n_nodes); + ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0; + + ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb; + + const bool should_capture = ctx->capture_next_compute; + if (should_capture) { + ctx->capture_next_compute = false; + + if (!ctx->capture_started) { + // create capture scope + ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx_dev->mtl_device]; + + MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; + descriptor.captureObject = ctx->capture_scope; + descriptor.destination = MTLCaptureDestinationGPUTraceDocument; + descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]]; + + NSError * error = nil; + if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { + GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); + } else { + [ctx->capture_scope beginScope]; + ctx->capture_started = true; + } } } - [encoder endEncoding]; + // the main thread commits the first few commands immediately + // command_buffer[n_cb] + { + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->command_buffers[n_cb] = command_buffer; - if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [command_buffer enqueue]; + ctx->encode_async(n_cb); } - }); - // Wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) + // prepare the rest of the command buffers asynchronously + // command_buffer[0.. 
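
The scheduling comment above boils down to a small piece of arithmetic: the first n_main nodes stay on the calling thread and the remainder is ceil-divided across the n_cb extra command buffers. In isolation:

// Node bookkeeping from ggml_metal_graph_compute in isolation. The counts are
// example values; the split and the ceiling division match the patch.
#include <stdio.h>

int main(void) {
    const int n_nodes = 1000;  // gf->n_nodes
    const int n_main  = 128;   // nodes encoded by the main thread
    const int n_cb    = 2;     // additional command buffers / threads

    const int n_nodes_0 = n_nodes < n_main ? n_nodes : n_main;
    const int n_nodes_1 = n_nodes - n_nodes_0;

    const int n_nodes_per_cb = (n_nodes_1 + n_cb - 1) / n_cb;  // ceil division

    printf("main thread: %d nodes, per extra cb: up to %d nodes\n",
           n_nodes_0, n_nodes_per_cb);
    return 0;
}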
n_cb) + for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->command_buffers[cb_idx] = command_buffer; - for (int i = 0; i < n_cb; ++i) { - id command_buffer = command_buffers[i]; - [command_buffer waitUntilCompleted]; + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } + } - MTLCommandBufferStatus status = [command_buffer status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); + + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id command_buffer = ctx->command_buffers[n_cb]; + [command_buffer waitUntilCompleted]; + + MTLCommandBufferStatus status = [command_buffer status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + } + + for (int i = 0; i < n_cb; ++i) { + id command_buffer = ctx->command_buffers[i]; + [command_buffer waitUntilCompleted]; + + MTLCommandBufferStatus status = [command_buffer status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; } - return GGML_STATUS_FAILED; + id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } - id next_buffer = (i + 1 < n_cb ? 
command_buffers[i + 1] : nil); - if (!next_buffer) { - continue; + if (!should_capture && ctx->capture_started) { + [ctx->capture_scope endScope]; + [[MTLCaptureManager sharedCaptureManager] stopCapture]; } - - bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; - } - - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; - } - - [next_buffer commit]; } - if (should_capture) { - [[MTLCaptureManager sharedCaptureManager] stopCapture]; - } - - } return GGML_STATUS_SUCCESS; } @@ -3078,44 +3146,19 @@ static enum ggml_status ggml_metal_graph_compute( // backend interface -// default buffer -static id g_backend_device = nil; -static int g_backend_device_ref_count = 0; - -static id ggml_backend_metal_get_device(void) { - if (g_backend_device == nil) { - g_backend_device = MTLCreateSystemDefaultDevice(); - } - - g_backend_device_ref_count++; - - return g_backend_device; -} - -static void ggml_backend_metal_free_device(void) { - assert(g_backend_device_ref_count > 0); - - g_backend_device_ref_count--; - - if (g_backend_device_ref_count == 0) { - [g_backend_device release]; - g_backend_device = nil; - } -} - -GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { return "Metal"; UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { [ctx->buffers[i].metal release]; } - ggml_backend_metal_free_device(); + ggml_backend_metal_device_rel(buffer->buft->device->context); if (ctx->owned) { #if TARGET_OS_OSX @@ -3128,25 +3171,25 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_ free(ctx); } -GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if 
(ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -3156,7 +3199,7 @@ GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; memset(ctx->all_data, value, ctx->all_size); @@ -3177,7 +3220,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { // default buffer type -GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; UNUSED(buft); @@ -3187,17 +3230,17 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", + GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } } else { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", + GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0); @@ -3208,8 +3251,8 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s UNUSED(size_aligned); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -3218,7 +3261,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_get_device(); + id device = ggml_backend_metal_device_acq(buft->device->context); ctx->all_data = ggml_metal_host_malloc(size_aligned); ctx->all_size = size_aligned; @@ -3232,16 +3275,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff if (size_aligned > 0) { ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; } } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", 
__func__, size_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); - ggml_backend_metal_free_device(); + ggml_backend_metal_device_rel(buft->device->context); return NULL; } @@ -3250,28 +3293,28 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } -GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; UNUSED(buft); } -GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - id device = ggml_backend_metal_get_device(); - size_t max_size = device.maxBufferLength; - ggml_backend_metal_free_device(); +static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + id device = ggml_backend_metal_device_acq(buft->device->context); + const size_t max_size = device.maxBufferLength; + ggml_backend_metal_device_rel(buft->device->context); return max_size; UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_type_get_name, @@ -3281,16 +3324,16 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, + /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, }; return &ggml_backend_buffer_type_metal; } -// buffer from ptr - -GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); +// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr +ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { + struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = data; ctx->all_size = size; @@ -3311,7 +3354,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_get_device(); + id device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main); // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { @@ -3323,7 +3366,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 
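
The buffer-type helpers now acquire and release the device through the device context instead of the old global-plus-refcount pair that this patch removes. A minimal sketch of that reference-counting pattern, with the id<MTLDevice> replaced by an opaque pointer and hypothetical create/destroy stand-ins so it compiles as plain C:

// Acquire/release sketch. The real code keeps an id<MTLDevice> in
// ggml_backend_metal_device_context; device_create/device_destroy below are
// stand-ins only, not real API.
#include <assert.h>
#include <stddef.h>

struct metal_device_context {
    void * mtl_device;   // id<MTLDevice> in the real code
    int    ref_count;
};

static void * device_create(void)      { static int dummy; return &dummy; } // stand-in
static void   device_destroy(void * d) { (void) d; }                        // stand-in

static void * device_acq(struct metal_device_context * ctx) {
    if (ctx->mtl_device == NULL) {
        ctx->mtl_device = device_create();
    }
    ctx->ref_count++;
    return ctx->mtl_device;
}

static void device_rel(struct metal_device_context * ctx) {
    assert(ctx->ref_count > 0);
    if (--ctx->ref_count == 0) {
        device_destroy(ctx->mtl_device);
        ctx->mtl_device = NULL;
    }
}

int main(void) {
    struct metal_device_context ctx = {0};
    void * dev = device_acq(&ctx);   // e.g. get_max_size: acquire, ...
    (void) dev;                      // ... query the device, ...
    device_rel(&ctx);                // ... then release it again
    return 0;
}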
1024.0 / 1024.0); return false; } } @@ -3349,7 +3392,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); return false; } } @@ -3357,7 +3400,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, ggml_backend_metal_log_allocated_size(device, size_step_aligned); if (i + size_step < size) { - GGML_METAL_LOG_INFO("\n"); + GGML_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -3369,40 +3412,89 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // backend -GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { +static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; UNUSED(backend); } -GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + ggml_backend_metal_device_rel(ctx_dev); ggml_metal_free(ctx); + free(backend); } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; - - return ggml_metal_graph_compute(metal_ctx, cgraph); +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return ggml_metal_graph_compute(backend, cgraph); } -GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; +static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { + GGML_ASSERT(ggml_backend_is_metal(backend)); - return ggml_metal_supports_op(metal_ctx, op); -} + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; -GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + if (ctx->n_cb != n_cb) { + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); - UNUSED(backend); + if (ctx->n_cb > 2) { + GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb); + } + } + + if (ctx->encode_async) { + Block_release(ctx->encode_async); + } + + ctx->encode_async = Block_copy(^(size_t iter) { + const int cb_idx = iter; + const int n_cb_l = ctx->n_cb; + + const int n_nodes_0 
= ctx->n_nodes_0; + const int n_nodes_1 = ctx->n_nodes_1; + + const int n_nodes_per_cb = ctx->n_nodes_per_cb; + + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = [command_buffer computeCommandEncoder]; + + int node_start = 0; + int node_end = n_nodes_0; + + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); + } + + const bool should_capture = ctx->capture_next_compute; + + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + ggml_metal_encode_node(backend, idx, encoder); + + if (should_capture) { + [encoder popDebugGroup]; + } + } + + [encoder endEncoding]; + + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } + }); } static struct ggml_backend_i ggml_backend_metal_i = { @@ -3418,56 +3510,46 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, - /* .supports_op = */ ggml_backend_metal_supports_op, - /* .supports_buft = */ ggml_backend_metal_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; -void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_metal_log_callback = log_callback; - ggml_metal_log_user_data = user_data; -} - static ggml_guid_t ggml_backend_metal_guid(void) { static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 }; return &guid; } +// TODO: remove in the future ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0); + + struct ggml_backend_metal_context * ctx = ggml_metal_init(dev); if (ctx == NULL) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } - ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); - *metal_backend = (struct ggml_backend) { + *backend = (struct ggml_backend) { /* .guid = */ ggml_backend_metal_guid(), /* .interface = */ ggml_backend_metal_i, + /* .device = */ dev, /* .context = */ ctx, }; - return metal_backend; + ggml_backend_metal_set_n_cb(backend, 1); + + return backend; } bool ggml_backend_is_metal(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid()); } -void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { - GGML_ASSERT(ggml_backend_is_metal(backend)); - - struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; - - ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); -} - void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { GGML_ASSERT(ggml_backend_is_metal(backend)); @@ -3480,23 +3562,258 @@ void ggml_backend_metal_set_abort_callback(ggml_backend_t 
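
Inside encode_async each command-buffer index maps to a contiguous node range, with index n_cb reserved for the main-thread slice. The same arithmetic as a standalone check (the counts are example values):

// Standalone version of the node-range arithmetic inside encode_async:
// cb_idx == n_cb selects the main-thread range [0, n_nodes_0); the others get
// consecutive slices of the remaining n_nodes_1 nodes.
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int n_nodes_0 = 128, n_nodes_1 = 872, n_cb = 2;
    const int n_nodes_per_cb = (n_nodes_1 + n_cb - 1) / n_cb;

    for (int cb_idx = 0; cb_idx <= n_cb; ++cb_idx) {
        int node_start = 0;
        int node_end   = n_nodes_0;

        if (cb_idx < n_cb) {
            node_start = n_nodes_0 +     (cb_idx + 0) * n_nodes_per_cb;
            node_end   = n_nodes_0 + MIN((cb_idx == n_cb - 1) ? n_nodes_1
                                                              : (cb_idx + 1) * n_nodes_per_cb,
                                         n_nodes_1);
        }
        printf("cb %d: nodes [%d, %d)\n", cb_idx, node_start, node_end);
    }
    return 0;
}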
backend, ggml_abort_ca bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; + return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { GGML_ASSERT(ggml_backend_is_metal(backend)); struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; - ctx->should_capture_next_compute = true; + ctx->capture_next_compute = true; } -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning +// backend device -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { - return ggml_backend_metal_init(); +static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) { + return "Metal"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) { + // acq/rel just to populate ctx->name in case it hasn't been done yet + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; + ggml_backend_metal_device_acq(ctx_dev); + ggml_backend_metal_device_rel(ctx_dev); + + return ctx_dev->name; +} + +static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + if (@available(macOS 10.12, iOS 16.0, *)) { + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; + id device = ggml_backend_metal_device_acq(ctx_dev); + + *total = device.recommendedMaxWorkingSetSize; + *free = *total - device.currentAllocatedSize; + + ggml_backend_metal_device_rel(ctx_dev); + } else { + *free = 1; + *total = 1; + } +} + +static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + + GGML_UNUSED(dev); +} + +static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_metal_device_get_name(dev); + props->description = ggml_backend_metal_device_get_description(dev); + props->type = ggml_backend_metal_device_get_type(dev); + ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = (struct ggml_backend_dev_caps) { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) { + struct ggml_backend_metal_context * ctx = ggml_metal_init(dev); + if (ctx == NULL) { + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return NULL; + } + + ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); + + *backend = (struct ggml_backend) { + /* .guid = */ ggml_backend_metal_guid(), + /* .interface = */ ggml_backend_metal_i, + /* .device = */ dev, + /* .context = */ ctx, + }; + + ggml_backend_metal_set_n_cb(backend, 1); + + return backend; GGML_UNUSED(params); - GGML_UNUSED(user_data); +} + +static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) { + return 
ggml_backend_metal_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); + + ctx->all_data = ptr; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) ptr % size_page; + ptr = (void *) ((char *) ptr - offs); + size += offs; + } + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; + id device = ggml_backend_metal_device_acq(ctx_dev); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].data = ptr; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].metal = nil; + + if (size_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } + } + + ggml_backend_metal_log_allocated_size(device, size_aligned); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? 
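
buffer_from_ptr must hand Metal a page-aligned base and a page-multiple length, so it rounds the pointer down, grows the size by the same offset, and rounds the total up to whole pages. The arithmetic alone, with 4096 standing in for sysconf(_SC_PAGESIZE) and a made-up address:

// Pointer/size page alignment as performed by the buffer_from_ptr path above.
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const size_t size_page = 4096;    // sysconf(_SC_PAGESIZE) in the real code

    uintptr_t ptr  = 0x10003a10;      // arbitrary, not page aligned
    size_t    size = 10000;

    // round the pointer down to a page boundary and grow the size accordingly
    const uintptr_t offs = ptr % size_page;
    ptr  -= offs;
    size += offs;

    // round the size up to a whole number of pages
    size_t size_aligned = size;
    if (size_aligned % size_page != 0) {
        size_aligned += size_page - (size_aligned % size_page);
    }

    printf("base = 0x%llx, size_aligned = %zu\n", (unsigned long long) ptr, size_aligned);
    return 0;
}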
size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) ptr + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].metal = nil; + + if (size_step_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } + } + + ggml_backend_metal_log_allocated_size(device, size_step_aligned); + + if (i + size_step < size) { + GGML_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); +} + +static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + struct ggml_backend_metal_device_context * ctx_dev = dev->context; + + return ggml_metal_supports_op(ctx_dev, op); +} + +static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + + UNUSED(dev); +} + +static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + return false; + + GGML_UNUSED(dev); + GGML_UNUSED(op); +} + +static struct ggml_backend_device_i ggml_backend_metal_device_i = { + /* .get_name = */ ggml_backend_metal_device_get_name, + /* .get_description = */ ggml_backend_metal_device_get_description, + /* .get_memory = */ ggml_backend_metal_device_get_memory, + /* .get_type = */ ggml_backend_metal_device_get_type, + /* .get_props = */ ggml_backend_metal_device_get_props, + /* .init_backend = */ ggml_backend_metal_device_init, + /* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_metal_device_supports_op, + /* .supports_buft = */ ggml_backend_metal_device_supports_buft, + /* .offload_op = */ ggml_backend_metal_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend registry + +static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) { + return "Metal"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + return &g_ggml_backend_metal_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { + /* .get_name = */ ggml_backend_metal_reg_get_name, + /* .device_count = */ ggml_backend_metal_reg_device_count, + /* .device_get = */ ggml_backend_metal_reg_device_get, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_metal_reg(void) { + // TODO: make this thread-safe somehow? 
+ { + g_ggml_backend_metal_reg = (struct ggml_backend_reg) { + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ NULL, + }; + + g_ggml_backend_metal_device = (struct ggml_backend_device) { + /* .iface = */ ggml_backend_metal_device_i, + /* .reg = */ &g_ggml_backend_metal_reg, + /* .context = */ &g_ggml_ctx_dev_main, + }; + } + + return &g_ggml_backend_metal_reg; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8bffce860..7aa6dce89 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); - const int vector_length = ggml_sve_cnt_b*8; + const int vector_length = ggml_cpu_get_sve_cnt()*8; // VLA Implementation using switch case switch (vector_length) { @@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); - const int vector_length = ggml_sve_cnt_b*8; + const int vector_length = ggml_cpu_get_sve_cnt()*8; //VLA Implemenation for SVE switch (vector_length) { diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index e96ce2b5e..df9c4b24a 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); -#if defined(__ARM_FEATURE_SVE) -extern int ggml_sve_cnt_b; -#endif - #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 49b3fa911..0e936b343 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -25,7 +25,7 @@ # include # include #endif -#include +#include #define UNUSED GGML_UNUSED @@ -57,8 +57,9 @@ struct socket_t { } }; -// ggml_tensor is serialized into rpc_tensor +// all RPC structures must be packed #pragma pack(push, 1) +// ggml_tensor is serialized into rpc_tensor struct rpc_tensor { uint64_t id; uint32_t type; @@ -76,7 +77,6 @@ struct rpc_tensor { char padding[4]; }; -#pragma pack(pop) static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8"); @@ -96,6 +96,65 @@ enum rpc_cmd { RPC_CMD_COUNT, }; +struct rpc_msg_alloc_buffer_req { + uint64_t size; +}; + +struct rpc_msg_alloc_buffer_rsp { + uint64_t remote_ptr; + uint64_t remote_size; +}; + +struct rpc_msg_get_alignment_rsp { + uint64_t alignment; +}; + +struct rpc_msg_get_max_size_rsp { + uint64_t max_size; +}; + +struct rpc_msg_buffer_get_base_req { + uint64_t remote_ptr; +}; + +struct rpc_msg_buffer_get_base_rsp { + uint64_t base_ptr; +}; + +struct rpc_msg_free_buffer_req { + uint64_t remote_ptr; +}; + +struct rpc_msg_buffer_clear_req { + uint64_t remote_ptr; + uint8_t value; +}; + +struct rpc_msg_get_tensor_req { + rpc_tensor tensor; + uint64_t offset; + uint64_t size; +}; + +struct rpc_msg_copy_tensor_req { + rpc_tensor src; + rpc_tensor dst; +}; + +struct rpc_msg_copy_tensor_rsp { + uint8_t result; +}; + +struct rpc_msg_graph_compute_rsp { + uint8_t result; +}; + +struct rpc_msg_get_device_memory_rsp { + uint64_t free_mem; + uint64_t total_mem; +}; +#pragma pack(pop) + // RPC data structures static ggml_guid_t ggml_backend_rpc_guid() { @@ -240,6 +299,38 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) { return true; } +static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) { + if (!send_data(sockfd, &msg_size, sizeof(msg_size))) { + return 
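
The RPC refactor replaces hand-serialized byte vectors with fixed-size packed structs, so either side can validate a message just by comparing its length against sizeof. A self-contained sketch of one such struct (the struct name mirrors the patch; the values are made up):

// Fixed-size, packed RPC messages: with #pragma pack(1) the in-memory layout
// is exactly the wire layout, so a receiver can reject any message whose size
// differs from sizeof(the expected struct).
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#pragma pack(push, 1)
struct rpc_msg_alloc_buffer_rsp {
    uint64_t remote_ptr;
    uint64_t remote_size;
};
#pragma pack(pop)

int main(void) {
    static_assert(sizeof(struct rpc_msg_alloc_buffer_rsp) == 16, "unexpected padding");

    struct rpc_msg_alloc_buffer_rsp rsp = { .remote_ptr = 0xdeadbeef, .remote_size = 1 << 20 };

    // what send_msg() would put on the wire: the raw bytes of the struct
    uint8_t wire[sizeof(rsp)];
    memcpy(wire, &rsp, sizeof(rsp));

    printf("message size on the wire: %zu bytes\n", sizeof(wire));
    return 0;
}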
false; + } + return send_data(sockfd, msg, msg_size); +} + +static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) { + uint64_t size; + if (!recv_data(sockfd, &size, sizeof(size))) { + return false; + } + if (size != msg_size) { + return false; + } + return recv_data(sockfd, msg, msg_size); +} + +static bool recv_msg(sockfd_t sockfd, std::vector & input) { + uint64_t size; + if (!recv_data(sockfd, &size, sizeof(size))) { + return false; + } + try { + input.resize(size); + } catch (const std::bad_alloc & e) { + fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size); + return false; + } + return recv_data(sockfd, input.data(), size); +} + static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) { size_t pos = endpoint.find(':'); if (pos == std::string::npos) { @@ -252,28 +343,27 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | // RPC response: | response_size (8 bytes) | response_data (response_size bytes) | -static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const std::vector & input, std::vector & output) { +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) { uint8_t cmd_byte = cmd; if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) { return false; } - uint64_t input_size = input.size(); if (!send_data(sock->fd, &input_size, sizeof(input_size))) { return false; } - if (!send_data(sock->fd, input.data(), input.size())) { + if (!send_data(sock->fd, input, input_size)) { return false; } - uint64_t output_size; - if (!recv_data(sock->fd, &output_size, sizeof(output_size))) { + // TODO: currently the output_size is always known, do we need support for commands with variable output size? 
+ // even if we do, we can skip sending output_size from the server for commands with known output size + uint64_t out_size; + if (!recv_data(sock->fd, &out_size, sizeof(out_size))) { return false; } - if (output_size == 0) { - output.clear(); - return true; + if (out_size != output_size) { + return false; } - output.resize(output_size); - if (!recv_data(sock->fd, output.data(), output_size)) { + if (!recv_data(sock->fd, output, output_size)) { return false; } return true; @@ -319,43 +409,31 @@ static std::shared_ptr get_socket(const std::string & endpoint) { return sock; } -GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; - // input serialization format: | remote_ptr (8 bytes) | - std::vector input(sizeof(uint64_t), 0); - uint64_t remote_ptr = ctx->remote_ptr; - memcpy(input.data(), &remote_ptr, sizeof(remote_ptr)); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, input, output); + rpc_msg_free_buffer_req request = {ctx->remote_ptr}; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); GGML_ASSERT(status); - GGML_ASSERT(output.empty()); delete ctx; } -GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) { return ctx->base_cache[buffer]; } - // input serialization format: | remote_ptr (8 bytes) | - std::vector input(sizeof(uint64_t), 0); - uint64_t remote_ptr = ctx->remote_ptr; - memcpy(input.data(), &remote_ptr, sizeof(remote_ptr)); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, input, output); + rpc_msg_buffer_get_base_req request = {ctx->remote_ptr}; + rpc_msg_buffer_get_base_rsp response; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == sizeof(uint64_t)); - // output serialization format: | base_ptr (8 bytes) | - uint64_t base_ptr; - memcpy(&base_ptr, output.data(), sizeof(base_ptr)); - void * base = reinterpret_cast(base_ptr); - ctx->base_cache[buffer] = base; - return base; + void * base_ptr = reinterpret_cast(response.base_ptr); + ctx->base_cache[buffer] = base_ptr; + return base_ptr; } static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { @@ -388,7 +466,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { return result; } -GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { UNUSED(buffer); if (ggml_is_quantized(tensor->type)) { // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized @@ -396,7 +474,7 @@ GGML_CALL static void 
ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t } } -GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) | size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size; @@ -405,29 +483,21 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input, output); + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0); GGML_ASSERT(status); } -GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; - // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) | - int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t); - std::vector input(input_size, 0); - rpc_tensor rpc_tensor = serialize_tensor(tensor); - memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); - memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); - memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &size, sizeof(size)); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, input, output); + rpc_msg_get_tensor_req request; + request.tensor = serialize_tensor(tensor); + request.offset = offset; + request.size = size; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size); GGML_ASSERT(status); - GGML_ASSERT(output.size() == size); - // output serialization format: | data (size bytes) | - memcpy(data, output.data(), size); } -GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { // check if src and dst are on the same server ggml_backend_buffer_t src_buffer = src->buffer; ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context; @@ -437,30 +507,19 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; - // input serialization format: | rpc_tensor src | rpc_tensor dst | - int input_size = 2*sizeof(rpc_tensor); - std::vector input(input_size, 0); - rpc_tensor rpc_src = serialize_tensor(src); - rpc_tensor rpc_dst = serialize_tensor(dst); - memcpy(input.data(), &rpc_src, sizeof(rpc_src)); - memcpy(input.data() + sizeof(rpc_src), &rpc_dst, sizeof(rpc_dst)); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, 
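
SET_TENSOR keeps the variable-length | rpc_tensor | offset | data | layout because its payload size is not fixed. A sketch of assembling that request; the small packed struct below is only a stand-in for the real, much larger rpc_tensor:

// Assembling a SET_TENSOR request the same way the patch does, with memcpy
// into one contiguous buffer. fake_rpc_tensor is a stand-in, not the real
// serialized tensor header.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#pragma pack(push, 1)
struct fake_rpc_tensor { uint64_t id; uint32_t type; };  // stand-in only
#pragma pack(pop)

int main(void) {
    const struct fake_rpc_tensor t = { .id = 42, .type = 0 };
    const uint64_t offset = 128;
    const uint8_t  data[256] = {0};
    const size_t   size = sizeof(data);

    const size_t input_size = sizeof(t) + sizeof(uint64_t) + size;
    uint8_t * input = calloc(1, input_size);
    if (input == NULL) {
        return 1;
    }

    memcpy(input,                              &t,      sizeof(t));
    memcpy(input + sizeof(t),                  &offset, sizeof(offset));
    memcpy(input + sizeof(t) + sizeof(offset), data,    size);

    printf("request size = %zu bytes\n", input_size);
    free(input);
    return 0;
}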
RPC_CMD_COPY_TENSOR, input, output); + rpc_msg_copy_tensor_req request; + request.src = serialize_tensor(src); + request.dst = serialize_tensor(dst); + rpc_msg_copy_tensor_rsp response; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response)); GGML_ASSERT(status); - // output serialization format: | result (1 byte) | - GGML_ASSERT(output.size() == 1); - return output[0]; + return response.result; } -GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; - // serialization format: | bufptr (8 bytes) | value (1 byte) | - int input_size = sizeof(uint64_t) + sizeof(uint8_t); - std::vector input(input_size, 0); - memcpy(input.data(), &ctx->remote_ptr, sizeof(ctx->remote_ptr)); - memcpy(input.data() + sizeof(ctx->remote_ptr), &value, sizeof(value)); - std::vector output; - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, input, output); + rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value}; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0); GGML_ASSERT(status); } @@ -477,32 +536,23 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { /* .reset = */ NULL, }; -GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->name.c_str(); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - // input serialization format: | size (8 bytes) | - int input_size = sizeof(uint64_t); - std::vector input(input_size, 0); - memcpy(input.data(), &size, sizeof(size)); - std::vector output; + rpc_msg_alloc_buffer_req request = {size}; + rpc_msg_alloc_buffer_rsp response; auto sock = get_socket(buft_ctx->endpoint); - bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, input, output); + bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == 2*sizeof(uint64_t)); - // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) | - uint64_t remote_ptr; - memcpy(&remote_ptr, output.data(), sizeof(remote_ptr)); - size_t remote_size; - memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size)); - if (remote_ptr != 0) { + if (response.remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, - new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, - remote_size); + new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, + response.remote_size); return buffer; } else { return nullptr; @@ -510,42 +560,30 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer } static size_t 
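Editor note: the ALLOC_BUFFER conversion above keeps the same contract as the old byte format: the request carries the size, the response carries remote_ptr and remote_size, and a zero remote_ptr signals failure. A self-contained sketch of that contract with a pretend server-side handler (all names illustrative):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct alloc_req { uint64_t size; };
struct alloc_rsp { uint64_t remote_ptr; uint64_t remote_size; };

// pretend server handler: allocates locally and reports the pointer back
static alloc_rsp handle_alloc(const alloc_req & req) {
    alloc_rsp rsp = {0, 0};
    void * p = std::malloc(req.size);
    if (p != nullptr) {
        rsp.remote_ptr  = (uint64_t)(uintptr_t)p;
        rsp.remote_size = req.size; // a real backend may round this up
    }
    return rsp;
}

int main() {
    alloc_rsp rsp = handle_alloc(alloc_req{1024});
    if (rsp.remote_ptr != 0) {
        std::printf("allocated %llu bytes at remote_ptr 0x%llx\n",
                    (unsigned long long)rsp.remote_size,
                    (unsigned long long)rsp.remote_ptr);
        std::free((void *)(uintptr_t)rsp.remote_ptr);
    } else {
        std::printf("allocation failed\n");
    }
}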
get_alignment(const std::shared_ptr & sock) { - // input serialization format: | 0 bytes | - std::vector input; - std::vector output; - bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, input, output); + rpc_msg_get_alignment_rsp response; + bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == sizeof(uint64_t)); - // output serialization format: | alignment (8 bytes) | - uint64_t alignment; - memcpy(&alignment, output.data(), sizeof(alignment)); - return alignment; + return response.alignment; } -GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->alignment; } static size_t get_max_size(const std::shared_ptr & sock) { - // input serialization format: | 0 bytes | - std::vector input; - std::vector output; - bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, input, output); + rpc_msg_get_max_size_rsp response; + bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == sizeof(uint64_t)); - // output serialization format: | max_size (8 bytes) | - uint64_t max_size; - memcpy(&max_size, output.data(), sizeof(max_size)); - return max_size; + return response.max_size; } -GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->max_size; } -GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { UNUSED(buft); return ggml_nbytes(tensor); } @@ -559,24 +597,24 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) { +static const char * ggml_backend_rpc_name(ggml_backend_t backend) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; return rpc_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) { +static void ggml_backend_rpc_free(ggml_backend_t backend) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; delete rpc_ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context; return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); } -GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { +static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { UNUSED(backend); // this is no-op because we don't have any async operations } @@ -618,32 +656,15 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector & o memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); } -GGML_CALL static enum 
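Editor note: GET_ALIGNMENT and GET_MAX_SIZE above now send empty requests and receive one-field responses; on the client these values are presumably fetched once and cached in the buffer-type context (buft_ctx->alignment, buft_ctx->max_size), so later queries never touch the socket. A small sketch of that query-once-then-cache shape, where remote_query_* stand in for the RPC round-trips and the numbers are invented:

#include <cstddef>
#include <cstdio>
#include <string>

static size_t remote_query_alignment() { return 32;        } // pretend RPC call
static size_t remote_query_max_size()  { return 1ull << 30; } // pretend RPC call

struct buffer_type_context {
    std::string endpoint;
    size_t alignment;
    size_t max_size;
};

static buffer_type_context make_buffer_type(const std::string & endpoint) {
    // one round-trip per property at creation time ...
    return buffer_type_context{endpoint, remote_query_alignment(), remote_query_max_size()};
}

int main() {
    buffer_type_context ctx = make_buffer_type("localhost:50052");
    // ... later queries are just field reads, no socket traffic
    std::printf("alignment=%zu max_size=%zu\n", ctx.alignment, ctx.max_size);
}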
ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; std::vector input; serialize_graph(cgraph, input); - std::vector output; + rpc_msg_graph_compute_rsp response; auto sock = get_socket(rpc_ctx->endpoint); - bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input, output); + bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == 1); - return (enum ggml_status)output[0]; -} - -GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - UNUSED(backend); - UNUSED(op); - //TODO: call the remote backend and cache the results - return true; -} - -GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; + return (enum ggml_status)response.result; } static ggml_backend_i ggml_backend_rpc_interface = { @@ -659,17 +680,14 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, - /* .supports_op = */ ggml_backend_rpc_supports_op, - /* .supports_buft = */ ggml_backend_rpc_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { +GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { static std::mutex mutex; std::lock_guard lock(mutex); // NOTE: buffer types are allocated and never freed; this is by design @@ -694,13 +712,14 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type { /* .iface = */ ggml_backend_rpc_buffer_type_interface, + /* .device = */ ggml_backend_rpc_add_device(endpoint), /* .context = */ buft_ctx }; buft_map[endpoint] = buft; return buft; } -GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { +ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { /* .endpoint = */ endpoint, /* .name = */ "RPC[" + std::string(endpoint) + "]", @@ -709,32 +728,25 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_rpc_guid(), /* .interface = */ ggml_backend_rpc_interface, + /* .device = */ ggml_backend_rpc_add_device(endpoint), /* .context = */ ctx }; return backend; } -GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) { +GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid()); } static void get_device_memory(const 
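Editor note: GRAPH_COMPUTE above is the mixed case: the request stays a variable-length serialized graph while the reply becomes a one-field struct. Assuming the transport still length-prefixes variable payloads (the send_msg/recv_msg helpers are defined earlier in the patch and not shown here), the framing looks roughly like this sketch; every name below is illustrative:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct graph_compute_rsp { uint8_t result; };

// frame = | payload size (8 bytes) | payload | ; the fixed-size reply needs no prefix
static std::vector<uint8_t> frame_request(const std::vector<uint8_t> & payload) {
    uint64_t n = payload.size();
    std::vector<uint8_t> frame(sizeof(n) + payload.size());
    std::memcpy(frame.data(), &n, sizeof(n));
    std::memcpy(frame.data() + sizeof(n), payload.data(), payload.size());
    return frame;
}

int main() {
    std::vector<uint8_t> serialized_graph = {0x01, 0x02, 0x03};
    std::vector<uint8_t> frame = frame_request(serialized_graph);
    assert(frame.size() == sizeof(uint64_t) + serialized_graph.size());

    graph_compute_rsp rsp = {0}; // 0 stands for a successful ggml_status in this sketch
    assert(sizeof(rsp) == 1);    // reply size is known to both sides
    (void)rsp;
}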
std::shared_ptr & sock, size_t * free, size_t * total) { - // input serialization format: | 0 bytes | - std::vector input; - std::vector output; - bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, input, output); + rpc_msg_get_device_memory_rsp response; + bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response)); GGML_ASSERT(status); - GGML_ASSERT(output.size() == 2*sizeof(uint64_t)); - // output serialization format: | free (8 bytes) | total (8 bytes) | - uint64_t free_mem; - memcpy(&free_mem, output.data(), sizeof(free_mem)); - uint64_t total_mem; - memcpy(&total_mem, output.data() + sizeof(uint64_t), sizeof(total_mem)); - *free = free_mem; - *total = total_mem; + *free = response.free_mem; + *total = response.total_mem; } -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { +GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { auto sock = get_socket(endpoint); if (sock == nullptr) { *free = 0; @@ -751,16 +763,16 @@ public: rpc_server(ggml_backend_t backend) : backend(backend) {} ~rpc_server(); - bool alloc_buffer(const std::vector & input, std::vector & output); - void get_alignment(std::vector & output); - void get_max_size(std::vector & output); - bool buffer_get_base(const std::vector & input, std::vector & output); - bool free_buffer(const std::vector & input); - bool buffer_clear(const std::vector & input); + void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response); + void get_alignment(rpc_msg_get_alignment_rsp & response); + void get_max_size(rpc_msg_get_max_size_rsp & response); + bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response); + bool free_buffer(const rpc_msg_free_buffer_req & request); + bool buffer_clear(const rpc_msg_buffer_clear_req & request); bool set_tensor(const std::vector & input); - bool get_tensor(const std::vector & input, std::vector & output); - bool copy_tensor(const std::vector & input, std::vector & output); - bool graph_compute(const std::vector & input, std::vector & output); + bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector & response); + bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response); + bool graph_compute(const std::vector & input, rpc_msg_graph_compute_rsp & response); private: ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor); @@ -774,80 +786,50 @@ private: std::unordered_set buffers; }; -bool rpc_server::alloc_buffer(const std::vector & input, std::vector & output) { - // input serialization format: | size (8 bytes) | - if (input.size() != sizeof(uint64_t)) { - return false; - } - uint64_t size; - memcpy(&size, input.data(), sizeof(size)); +void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) { ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); - uint64_t remote_ptr = 0; - uint64_t remote_size = 0; + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size); + response.remote_ptr = 0; + response.remote_size = 0; if (buffer != nullptr) { - remote_ptr = reinterpret_cast(buffer); - remote_size = buffer->size; - GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", 
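Editor note: the rpc_server declaration above changes every handler from a pair of byte vectors to typed request/response structs, and drops the bool return where the handler cannot fail. A tiny sketch of that signature style; demo_server and its message structs are invented for illustration:

#include <cassert>
#include <cstdint>

struct get_alignment_rsp { uint64_t alignment; };
struct buffer_clear_req  { uint64_t remote_ptr; uint8_t value; };

struct demo_server {
    // cannot fail: no return value, just fill the response
    void get_alignment(get_alignment_rsp & response) const {
        response.alignment = 128;
    }
    // can fail (unknown buffer): bool signals whether to keep the connection
    bool buffer_clear(const buffer_clear_req & request) const {
        return request.remote_ptr != 0;
    }
};

int main() {
    demo_server srv;
    get_alignment_rsp rsp = {};
    srv.get_alignment(rsp);
    assert(rsp.alignment == 128);
    assert(!srv.buffer_clear(buffer_clear_req{0, 0xAA}));
}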
__func__, size, remote_ptr, remote_size); + response.remote_ptr = reinterpret_cast(buffer); + response.remote_size = buffer->size; + GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size); buffers.insert(buffer); } else { - GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size); + GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size); } - // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) | - output.resize(2*sizeof(uint64_t), 0); - memcpy(output.data(), &remote_ptr, sizeof(remote_ptr)); - memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size)); - return true; } -void rpc_server::get_alignment(std::vector & output) { +void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) { ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); size_t alignment = ggml_backend_buft_get_alignment(buft); GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment); - // output serialization format: | alignment (8 bytes) | - output.resize(sizeof(uint64_t), 0); - memcpy(output.data(), &alignment, sizeof(alignment)); + response.alignment = alignment; } -void rpc_server::get_max_size(std::vector & output) { +void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) { ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); size_t max_size = ggml_backend_buft_get_max_size(buft); GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size); - // output serialization format: | max_size (8 bytes) | - output.resize(sizeof(uint64_t), 0); - memcpy(output.data(), &max_size, sizeof(max_size)); + response.max_size = max_size; } -bool rpc_server::buffer_get_base(const std::vector & input, std::vector & output) { - // input serialization format: | remote_ptr (8 bytes) | - if (input.size() != sizeof(uint64_t)) { - return false; - } - uint64_t remote_ptr; - memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); - GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr); - ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); +bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) { + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr); + ggml_backend_buffer_t buffer = reinterpret_cast(request.remote_ptr); if (buffers.find(buffer) == buffers.end()) { GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); return false; } void * base = ggml_backend_buffer_get_base(buffer); - // output serialization format: | base_ptr (8 bytes) | - uint64_t base_ptr = reinterpret_cast(base); - output.resize(sizeof(uint64_t), 0); - memcpy(output.data(), &base_ptr, sizeof(base_ptr)); + response.base_ptr = reinterpret_cast(base); return true; } -bool rpc_server::free_buffer(const std::vector & input) { - // input serialization format: | remote_ptr (8 bytes) | - if (input.size() != sizeof(uint64_t)) { - return false; - } - uint64_t remote_ptr; - memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); - GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr); - ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); +bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) { + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr); + ggml_backend_buffer_t buffer = reinterpret_cast(request.remote_ptr); if 
(buffers.find(buffer) == buffers.end()) { GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); return false; @@ -857,22 +839,14 @@ bool rpc_server::free_buffer(const std::vector & input) { return true; } -bool rpc_server::buffer_clear(const std::vector & input) { - // input serialization format: | remote_ptr (8 bytes) | value (1 byte) | - if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) { - return false; - } - uint64_t remote_ptr; - memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); - uint8_t value; - memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value)); - GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value); - ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); +bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) { + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value); + ggml_backend_buffer_t buffer = reinterpret_cast(request.remote_ptr); if (buffers.find(buffer) == buffers.end()) { GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); return false; } - ggml_backend_buffer_clear(buffer, value); + ggml_backend_buffer_clear(buffer, request.value); return true; } @@ -947,74 +921,55 @@ bool rpc_server::set_tensor(const std::vector & input) { return true; } -bool rpc_server::get_tensor(const std::vector & input, std::vector & output) { - // serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) | - if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) { - return false; - } - const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); - uint64_t offset; - memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); - uint64_t size; - memcpy(&size, input.data() + sizeof(rpc_tensor) + sizeof(offset), sizeof(size)); - +bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector & response) { struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; struct ggml_context * ctx = ggml_init(params); - ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); + ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); if (tensor == nullptr) { GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__); ggml_free(ctx); return false; } - GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size); // sanitize tensor->data { const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); - if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + if (request.tensor.data + request.offset < p0 || + request.tensor.data + request.offset >= p1 || + request.size > (p1 - request.tensor.data - request.offset)) { + GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); } } - // output serialization format: | data (size bytes) | - output.resize(size, 0); - ggml_backend_tensor_get(tensor, output.data(), offset, size); + response.resize(request.size, 0); + ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size); ggml_free(ctx); return true; } -bool 
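Editor note: the sanitization block in rpc_server::get_tensor above rejects any request whose window [data + offset, data + offset + size) is not fully inside the buffer [p0, p1). A worked sketch of the same arithmetic with plain size_t values:

#include <cassert>
#include <cstddef>

static bool in_bounds(size_t p0, size_t p1, size_t data, size_t offset, size_t size) {
    return data + offset >= p0 &&
           data + offset <  p1 &&
           size <= p1 - (data + offset);
}

int main() {
    const size_t p0 = 0x1000;          // buffer base
    const size_t p1 = p0 + 0x100;      // buffer end (base + buffer size)
    assert( in_bounds(p0, p1, 0x1000, 0x10, 0x20)); // fully inside
    assert(!in_bounds(p0, p1, 0x1000, 0xF0, 0x20)); // runs past the end
    assert(!in_bounds(p0, p1, 0x0800, 0x00, 0x10)); // starts before the buffer
}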
rpc_server::copy_tensor(const std::vector & input, std::vector & output) { - // serialization format: | rpc_tensor src | rpc_tensor dst | - if (input.size() != 2*sizeof(rpc_tensor)) { - return false; - } - const rpc_tensor * rpc_src = (const rpc_tensor *)input.data(); - const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src)); - +bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response) { struct ggml_init_params params { /*.mem_size =*/ 2*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; struct ggml_context * ctx = ggml_init(params); - ggml_tensor * src = deserialize_tensor(ctx, rpc_src); - ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst); + ggml_tensor * src = deserialize_tensor(ctx, &request.src); + ggml_tensor * dst = deserialize_tensor(ctx, &request.dst); if (src == nullptr || dst == nullptr) { GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__); ggml_free(ctx); return false; } GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer); - bool result = ggml_backend_buffer_copy_tensor(src, dst); - // output serialization format: | result (1 byte) | - output.resize(1, 0); - output[0] = result; + response.result = ggml_backend_buffer_copy_tensor(src, dst); ggml_free(ctx); return true; } @@ -1043,7 +998,7 @@ ggml_tensor * rpc_server::create_node(uint64_t id, return result; } -bool rpc_server::graph_compute(const std::vector & input, std::vector & output) { +bool rpc_server::graph_compute(const std::vector & input, rpc_msg_graph_compute_rsp & response) { // serialization format: // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | if (input.size() < sizeof(uint32_t)) { @@ -1083,9 +1038,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vectornodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); } ggml_status status = ggml_backend_graph_compute(backend, graph); - // output serialization format: | status (1 byte) | - output.resize(1, 0); - output[0] = status; + response.result = status; ggml_free(ctx); return true; } @@ -1108,89 +1061,157 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre fprintf(stderr, "Unknown command: %d\n", cmd); break; } - std::vector input; - std::vector output; - uint64_t input_size; - if (!recv_data(sockfd, &input_size, sizeof(input_size))) { - break; - } - try { - input.resize(input_size); - } catch (const std::bad_alloc & e) { - fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", input_size); - break; - } - if (!recv_data(sockfd, input.data(), input_size)) { - break; - } - bool ok = true; switch (cmd) { case RPC_CMD_ALLOC_BUFFER: { - ok = server.alloc_buffer(input, output); + rpc_msg_alloc_buffer_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + rpc_msg_alloc_buffer_rsp response; + server.alloc_buffer(request, response); + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_GET_ALIGNMENT: { - server.get_alignment(output); + if (!recv_msg(sockfd, nullptr, 0)) { + return; + } + rpc_msg_get_alignment_rsp response; + server.get_alignment(response); + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_GET_MAX_SIZE: { - server.get_max_size(output); + if (!recv_msg(sockfd, nullptr, 0)) { + return; + } + rpc_msg_get_max_size_rsp response; + 
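Editor note: rpc_serve_client above no longer reads one generic length-prefixed blob per command; each case receives its own fixed-size request, runs the handler, and sends its own response, returning (and dropping the client) on any failure instead of threading an ok flag through the loop. A stripped-down sketch of that dispatch shape with invented command and message names:

#include <cstdint>
#include <cstdio>

enum demo_cmd : uint8_t { DEMO_CMD_PING = 0, DEMO_CMD_QUIT = 1 };

struct ping_req { uint64_t payload; };
struct ping_rsp { uint64_t payload; };

static bool recv_req(ping_req & req) { req.payload = 42; return true; } // transport stand-in
static bool send_rsp(const ping_rsp & rsp) {
    std::printf("pong %llu\n", (unsigned long long)rsp.payload);
    return true;
}

static void serve_one(demo_cmd cmd) {
    switch (cmd) {
        case DEMO_CMD_PING: {
            ping_req req;
            if (!recv_req(req)) {
                return;               // transport error: drop the client
            }
            ping_rsp rsp = {req.payload};
            if (!send_rsp(rsp)) {
                return;
            }
            break;                    // success: loop to the next command
        }
        default: {
            std::fprintf(stderr, "Unknown command: %d\n", cmd);
            return;
        }
    }
}

int main() { serve_one(DEMO_CMD_PING); }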
server.get_max_size(response); + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_BUFFER_GET_BASE: { - ok = server.buffer_get_base(input, output); + rpc_msg_buffer_get_base_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + rpc_msg_buffer_get_base_rsp response; + if (!server.buffer_get_base(request, response)) { + return; + } + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_FREE_BUFFER: { - ok = server.free_buffer(input); + rpc_msg_free_buffer_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + if (!server.free_buffer(request)) { + return; + } + if (!send_msg(sockfd, nullptr, 0)) { + return; + } break; } case RPC_CMD_BUFFER_CLEAR: { - ok = server.buffer_clear(input); + rpc_msg_buffer_clear_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + if (!server.buffer_clear(request)) { + return; + } + if (!send_msg(sockfd, nullptr, 0)) { + return; + } break; } case RPC_CMD_SET_TENSOR: { - ok = server.set_tensor(input); + std::vector input; + if (!recv_msg(sockfd, input)) { + return; + } + if (!server.set_tensor(input)) { + return; + } + if (!send_msg(sockfd, nullptr, 0)) { + return; + } break; } case RPC_CMD_GET_TENSOR: { - ok = server.get_tensor(input, output); + rpc_msg_get_tensor_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + std::vector response; + if (!server.get_tensor(request, response)) { + return; + } + if (!send_msg(sockfd, response.data(), response.size())) { + return; + } break; } case RPC_CMD_COPY_TENSOR: { - ok = server.copy_tensor(input, output); + rpc_msg_copy_tensor_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + rpc_msg_copy_tensor_rsp response; + if (!server.copy_tensor(request, response)) { + return; + } + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_GRAPH_COMPUTE: { - ok = server.graph_compute(input, output); + std::vector input; + if (!recv_msg(sockfd, input)) { + return; + } + rpc_msg_graph_compute_rsp response; + if (!server.graph_compute(input, response)) { + return; + } + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } case RPC_CMD_GET_DEVICE_MEMORY: { - // output serialization format: | free (8 bytes) | total (8 bytes) | - output.resize(2*sizeof(uint64_t), 0); - memcpy(output.data(), &free_mem, sizeof(free_mem)); - memcpy(output.data() + sizeof(uint64_t), &total_mem, sizeof(total_mem)); + if (!recv_msg(sockfd, nullptr, 0)) { + return; + } + rpc_msg_get_device_memory_rsp response; + response.free_mem = free_mem; + response.total_mem = total_mem; + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } break; } default: { fprintf(stderr, "Unknown command: %d\n", cmd); - ok = false; + return; } } - if (!ok) { - break; - } - uint64_t output_size = output.size(); - if (!send_data(sockfd, &output_size, sizeof(output_size))) { - break; - } - if (!send_data(sockfd, output.data(), output_size)) { - break; - } } } -void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { +void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { std::string host; int port; if (!parse_endpoint(endpoint, host, port)) { @@ -1227,3 +1248,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free WSACleanup(); #endif } + +// device 
interface + +struct ggml_backend_rpc_device_context { + std::string endpoint; + std::string name; +}; + +static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ctx->name.c_str(); +} + +static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ctx->name.c_str(); +} + +static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total); + + UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) { + // TODO: obtain value from the server + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + + UNUSED(dev); +} + +static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_rpc_device_get_name(dev); + props->description = ggml_backend_rpc_device_get_description(dev); + props->type = ggml_backend_rpc_device_get_type(dev); + ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ggml_backend_rpc_init(ctx->endpoint.c_str()); + + UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); + + UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + UNUSED(dev); + UNUSED(max_tensor_size); +} + +static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + //TODO: call the remote backend and cache the results + return true; +} + +static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context; + return buft_ctx->endpoint == dev_ctx->endpoint; +} + +static const struct ggml_backend_device_i ggml_backend_rpc_device_i = { + /* .get_name = */ ggml_backend_rpc_device_get_name, + /* .get_description = */ ggml_backend_rpc_device_get_description, + /* .get_memory = */ ggml_backend_rpc_device_get_memory, + /* .get_type = */ ggml_backend_rpc_device_get_type, + /* .get_props = */ ggml_backend_rpc_device_get_props, + /* .init_backend = */ ggml_backend_rpc_device_init, + /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* 
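Editor note: a hedged usage sketch, not part of the patch, of how a host application might use the RPC device entry points introduced above. ggml_backend_rpc_add_device, ggml_backend_rpc_init and ggml_backend_rpc_get_device_memory appear in this diff; the header names, the example endpoint, and the generic ggml_backend_dev_name/ggml_backend_free calls are assumptions about the surrounding ggml API.

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-rpc.h"   // assumed header exposing the RPC backend API

int main() {
    const char * endpoint = "localhost:50052";      // example endpoint

    // register (or look up) the device for this endpoint
    ggml_backend_dev_t dev = ggml_backend_rpc_add_device(endpoint);
    std::printf("device: %s\n", ggml_backend_dev_name(dev));

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
    std::printf("free: %zu, total: %zu\n", free_mem, total_mem);

    // create a backend talking to the same endpoint
    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
    ggml_backend_free(backend);
    return 0;
}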
.buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_rpc_device_supports_op, + /* .supports_buft = */ ggml_backend_rpc_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend reg interface + +static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) { + return "RPC"; + + UNUSED(reg); +} + +static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) { + return 0; + + UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead"); + + UNUSED(reg); + UNUSED(index); +} + +static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) { + return (void *)ggml_backend_rpc_add_device; + } + return NULL; + + UNUSED(reg); +} + +static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { + /* .get_name = */ ggml_backend_rpc_reg_get_name, + /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count, + /* .get_device = */ ggml_backend_rpc_reg_get_device, + /* .get_proc_address = */ ggml_backend_rpc_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_rpc_reg(void) { + static struct ggml_backend_reg ggml_backend_rpc_reg = { + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_rpc_reg; +} + +ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { + static std::unordered_map dev_map; + + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (dev_map.find(endpoint) != dev_map.end()) { + return dev_map[endpoint]; + } + + ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context { + /* .endpoint = */ endpoint, + /* .name = */ "RPC[" + std::string(endpoint) + "]", + }; + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_rpc_device_i, + /* .reg = */ ggml_backend_rpc_reg(), + /* .context = */ ctx, + }; + + dev_map[endpoint] = dev; + + return dev; +} diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 6978a3192..4d91ee460 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -40,17 +40,316 @@ #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" -bool ggml_sycl_loaded(void); -void ggml_sycl_free_data(struct ggml_tensor * tensor); -void ggml_sycl_copy_to_device(struct ggml_tensor * tensor); -void ggml_sycl_set_main_device(int main_device); -void ggml_sycl_set_mul_mat_q(bool mul_mat_q); -void ggml_sycl_get_device_description(int device, char * description, size_t description_size); -bool ggml_backend_is_sycl(ggml_backend_t backend); -int ggml_backend_sycl_get_device(ggml_backend_t backend); -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); -static inline int get_sycl_env(const char *env_name, int default_val); +static bool g_sycl_loaded = false; +static ggml_sycl_device_info ggml_sycl_init() { + ggml_sycl_device_info info = {}; + + info.device_count = dpct::dev_mgr::instance().device_count(); + if (info.device_count == 0) { + fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); + + int64_t total_vram = 0; +#if defined(GGML_SYCL_FORCE_MMQ) + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: 
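Editor note: because the registry above reports zero enumerable devices, callers are expected to fetch ggml_backend_rpc_add_device by name through get_proc_address and register endpoints themselves. A minimal self-contained sketch of that lookup pattern; the demo_* names are invented:

#include <cstdio>
#include <cstring>

typedef void * (*add_device_fn)(const char * endpoint);

static void * demo_add_device(const char * endpoint) {
    std::printf("adding device for %s\n", endpoint);
    return nullptr; // a real implementation returns a device handle
}

// hands out named entry points instead of enumerating devices
static void * demo_get_proc_address(const char * name) {
    if (std::strcmp(name, "demo_add_device") == 0) {
        return (void *)demo_add_device;
    }
    return nullptr; // unknown symbol
}

int main() {
    add_device_fn add_device = (add_device_fn)demo_get_proc_address("demo_add_device");
    if (add_device != nullptr) {
        add_device("localhost:50052");
    }
}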
yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); +#endif +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif + fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); + + for (int i = 0; i < info.device_count; ++i) { + info.devices[i].vmm = 0; + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(i)))); + + info.default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + info.devices[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + + info.max_work_group_sizes[i] = prop.get_max_work_group_size(); + } + + for (int id = 0; id < info.device_count; ++id) { + info.default_tensor_split[id] /= total_vram; + } + return info; +} + +const ggml_sycl_device_info & ggml_sycl_info() { + static ggml_sycl_device_info info = ggml_sycl_init(); + return info; +} + +void print_device_detail(int id, sycl::device &device, std::string device_type) { + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::get_device_info(prop, device))); + + std::string version; + version += std::to_string(prop.get_major_version()); + version += "."; + version += std::to_string(prop.get_minor_version()); + + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + std::string name = std::string(prop.get_name()); + name = std::regex_replace(name, std::regex("\\(R\\)"), ""); + name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); + + auto global_mem_size = prop.get_global_mem_size()/1000000; + + fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), + name.c_str(), version.c_str(), prop.get_max_compute_units(), + prop.get_max_work_group_size(), prop.get_max_sub_group_size(), + global_mem_size, device.get_info().c_str()); +} + +void ggml_backend_sycl_print_sycl_devices() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); + int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; + fprintf(stderr, "found %d SYCL devices:\n", device_count); + fprintf(stderr, "| | | | |Max | |Max |Global | |\n"); + fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); + fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); + fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); + for (int id = 0; id < device_count; ++id) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail(id, device, device_type.str()); + } +} + +static inline int get_sycl_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); + int user_number = default_val; + + unsigned n; + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } + return user_number; +} + +static void ggml_check_sycl() try { + static bool initialized = false; + + if (!initialized) { + fprintf(stderr, "[SYCL] call ggml_check_sycl\n"); 
+ g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); + +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); +#endif + +/* NOT REMOVE, keep it for next optimize for XMX. +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif +*/ + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + initialized = true; + g_sycl_loaded = false; + return; + } + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); + initialized = true; + g_sycl_loaded = true; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +/* +device_index: device index from 0 to n (continue numbers). + It is used for device select/set in SYCL backend internal data structure. +*/ +inline void check_allow_gpu_index(const int device_index) { + if (device_index >= ggml_sycl_info().device_count) { + char error_buf[256]; + snprintf( + error_buf, + sizeof(error_buf), + "%s error: device_index:%d is out of range: [0-%d]", + __func__, + device_index, + ggml_sycl_info().device_count - 1); + fprintf(stderr, "%s\n", error_buf); + assert(false); + } +} + +GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len) try { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_gpu_list\n"); + for(int i=0;i=max_len) break; + id_list[i] = i; + } + return; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// sycl buffer + +struct ggml_backend_sycl_buffer_context { + int device; + void * dev_ptr = nullptr; + queue_ptr stream; + std::string name; + + ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : + device(device), dev_ptr(dev_ptr), stream(stream) { + check_allow_gpu_index(device); + name = (GGML_SYCL_NAME + std::to_string(device)); + } + + + ~ggml_backend_sycl_buffer_context() { + if (dev_ptr != nullptr) { + ggml_sycl_set_device(device); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); + } + } +}; + +static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; +} + +static void +ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + ggml_sycl_set_device(ctx->device); + + delete ctx; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + return ctx->dev_ptr; +} + +static void +ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + 
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset( + (char *)tensor->data + original_size, 0, + padded_size - original_size).wait())); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor, + const void *data, size_t offset, + size_t size) try { + + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + + ggml_sycl_set_device(ctx->device); + auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); + SYCL_CHECK( + CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); + char* host_buf = (char*)malloc(size); + memcpy(host_buf, data, size); + SYCL_CHECK( + CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size) + .wait())); + free(host_buf); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *tensor, + void *data, size_t offset, + size_t size) try { + + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + + ggml_sycl_set_device(ctx->device); + auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue(); + + SYCL_CHECK(CHECK_TRY_ERROR( + stream.memcpy(data, (const char *)tensor->data + offset, size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { @@ -60,6 +359,850 @@ void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, free(host_buf); } +static bool +ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *src, + ggml_tensor *dst) try { + if (ggml_backend_buffer_is_sycl(src->buffer)) { + ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; + ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; + + ggml_sycl_set_device(src_ctx->device); + /* + DPCT1009:198: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
+ */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw())); + ggml_sycl_set_device(dst_ctx->device); + /* + DPCT1009:199: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); + /* + DPCT1009:200: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + + queue_ptr stream_dst = dst_ctx->stream; + queue_ptr stream_src = src_ctx->stream; + size_t size = ggml_nbytes(src); + + //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs. + dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size); + +//todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove +#if 0 + SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy( + (char *)dst->data, (const char *)src->data, size).wait())); + + /* + DPCT1009:201: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); +#endif + return true; + } + return false; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + + +static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, + uint8_t value) try { + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + + ggml_sycl_set_device(ctx->device); + queue_ptr stream = ctx->stream; + SYCL_CHECK( + CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + SYCL_CHECK(CHECK_TRY_ERROR((*stream) + .memset(ctx->dev_ptr, value, buffer->size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { + /* .get_name = */ ggml_backend_sycl_buffer_get_name, + /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, + /* .get_base = */ ggml_backend_sycl_buffer_get_base, + /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, + /* .clear = */ ggml_backend_sycl_buffer_clear, + /* .reset = */ NULL, +}; + +// sycl buffer type +struct ggml_backend_sycl_buffer_type_context { + int device; + std::string name; + + // each buffer type has its own stream + queue_ptr stream = nullptr; +}; + +static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t +ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) try { + ggml_backend_sycl_buffer_type_context * buft_ctx = 
(ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_sycl_set_device(buft_ctx->device); + const queue_ptr stream = buft_ctx->stream; + size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 + + void * dev_ptr; + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( + size, *stream))); + if (!dev_ptr) { + fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size); + return nullptr; + } + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); + return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + GGML_UNUSED(buft); +} + +static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + return dpct::get_current_device().get_max_mem_alloc_size(); + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +static const ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { + /* .get_name = */ ggml_backend_sycl_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); + + auto dev_count = ggml_backend_sycl_get_device_count(); + + if (device>=dev_count or device<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device, dev_count-1); + GGML_ASSERT(devicedevice; + if (device>=ggml_sycl_info().device_count or device<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device, ggml_sycl_info().device_count-1); + GGML_ASSERT(devicestream(i, 0)}, + }; + } + ggml_backend_sycl_buffer_type_initialized = true; + } + return &ggml_backend_sycl_buffer_types[device]; +} + +// sycl split buffer + +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? 
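Editor note: ggml_backend_sycl_buffer_type_get_alloc_size above pads quantized tensors whose first dimension is not a multiple of MATRIX_ROW_PADDING by the row size of the missing remainder (the padding is later zeroed in init_tensor to avoid NaNs). A worked sketch of that computation; the row-size helper and the numbers are illustrative stand-ins for the ggml ones:

#include <cstddef>
#include <cstdint>
#include <cstdio>

static const int64_t MATRIX_ROW_PADDING = 512;   // value used by the CUDA/SYCL backends

// pretend "row size": bytes needed for n elements of a Q4_0-like type (18 bytes per 32 elems)
static size_t row_size(int64_t n) { return (size_t)(n * 18 / 32); }

int main() {
    const int64_t ne0    = 1000;                 // first dimension, not a multiple of 512
    const size_t  nbytes = row_size(ne0) * 4;    // 4 rows, say

    size_t size = nbytes;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        size += row_size(MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
    std::printf("unpadded: %zu bytes, padded: %zu bytes\n", nbytes, size);
}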
tensor_split[i + 1] : 1.0f)) { + if (min_compute_capability > ggml_sycl_info().devices[i].cc) { + min_compute_capability = ggml_sycl_info().devices[i].cc; + } + if (max_compute_capability < ggml_sycl_info().devices[i].cc) { + max_compute_capability = ggml_sycl_info().devices[i].cc; + } + } + } + + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= VER_GEN9 ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + return max_compute_capability >= VER_GEN9 ? 128 : 64; + case GGML_TYPE_IQ3_S: + return max_compute_capability >= VER_GEN9 ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ABORT("fatal error"); + } +} + +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + if (id == ggml_sycl_info().device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + +struct ggml_backend_sycl_split_buffer_type_context { + std::array tensor_split; +}; + +struct ggml_backend_sycl_split_buffer_context { + ~ggml_backend_sycl_split_buffer_context() try { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { + if (extra->events[i][is] != nullptr) { + /* + DPCT1009:206: SYCL uses exceptions to report errors and + does not use the error codes. The original code was + commented out and a warning string was inserted. You + need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::destroy_event(extra->events[i][is]))); + } + } + if (extra->data_device[i] != nullptr) { + /* + DPCT1009:207: SYCL uses exceptions to report errors and does + not use the error codes. The original code was commented out + and a warning string was inserted. You need to rewrite this + code. 
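Editor note: get_row_split above derives each device's row range from the tensor-split fractions, rounding boundaries down to the granularity returned by get_row_rounding and letting the last device absorb the remainder. A worked example with invented values:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows        = 1000;
    const int64_t rounding     = 64;                 // e.g. what get_row_rounding returns for Q8_0
    const float   split[3 + 1] = {0.0f, 0.25f, 0.75f, 1.0f}; // per-device start fractions
    const int     n_dev        = 3;

    for (int id = 0; id < n_dev; ++id) {
        int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        row_low         -= row_low % rounding;       // align the start of the slice
        int64_t row_high = id == n_dev - 1 ? nrows : (int64_t)(nrows * split[id + 1]);
        if (id != n_dev - 1) {
            row_high -= row_high % rounding;         // align all but the final boundary
        }
        std::printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
    }
    // -> [0, 192), [192, 704), [704, 1000)
}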
+ */ + ggml_sycl_set_device(i); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( + extra->data_device[i], *(streams[i])))); + } + } + delete extra; + } + } + catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + std::vector tensor_extras; + std::vector streams; +}; + +static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_SYCL_NAME "_Split"; + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name; +} + +static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + GGML_UNUSED(buffer); +} + +static void +ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + ctx->streams.push_back(&(dpct::get_current_device().default_queue())); + + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_sycl_set_device(i); + const queue_ptr stream = ctx->streams[i]; + char * buf; + /* + DPCT1009:208: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( + size, *stream))); + if (!buf) { + char err_buf[1024]; + snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); + throw std::runtime_error(err_buf); + } + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + /* + DPCT1009:209: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. 
+ */ + SYCL_CHECK(CHECK_TRY_ERROR( + (*stream) + .memset(buf + original_size, 0, size - original_size) + .wait())); + } + + extra->data_device[i] = buf; + + for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { + /* + DPCT1009:210: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. + */ + SYCL_CHECK( + CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); + } + } + tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void +ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor, const void *data, + size_t offset, size_t size) try { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + /* + DPCT1009:211: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
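// A sketch (not part of the patch) of the host-side addressing used by the split
// set_tensor above: device i's shard is the contiguous byte range starting at
// row_low * nb1 in the host buffer, and only the unpadded shard bytes are copied,
// so the padding written by init_tensor stays zero. Names here are illustrative.
#include <cstddef>
#include <cstdint>

struct shard_view {
    size_t host_offset;  // byte offset of this device's rows in the host buffer
    size_t copy_bytes;   // bytes actually transferred (unpadded size)
};

static shard_view shard_of(int64_t row_low, int64_t row_high, size_t nb1) {
    shard_view v;
    v.host_offset = (size_t) row_low * nb1;
    v.copy_bytes  = (size_t) (row_high - row_low) * nb1;  // equals ggml_nbytes_split() for contiguous rows
    return v;
}

// With rows [1024, 4096) on device 1 and nb1 = 4096 bytes per row, the copy reads host
// bytes [4 MiB, 16 MiB) and writes them to that device's buffer, mirroring the
// (*stream).memcpy(extra->data_device[i], buf_host, original_size) call above.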
+ */ + ggml_sycl_set_device(i); + const queue_ptr stream = ctx->streams[i]; + SYCL_CHECK(CHECK_TRY_ERROR( + (*stream) + .memcpy(extra->data_device[i], buf_host, original_size) + .wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void +ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *tensor, void *data, + size_t offset, size_t size) try { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + /* + DPCT1009:212: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
+ */ + ggml_sycl_set_device(i); + const queue_ptr stream = ctx->streams[i]; + SYCL_CHECK(CHECK_TRY_ERROR( + (*stream) + .memcpy(buf_host, extra->data_device[i], original_size) + .wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_UNUSED(buffer); + GGML_UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { + /* .get_name = */ ggml_backend_sycl_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_sycl_split_buffer_clear, + /* .reset = */ NULL, +}; + +// sycl split buffer type + +static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return GGML_SYCL_NAME "_Split"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. 
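// A sketch (not part of the patch) of the sizing contract described in the comment
// above: alloc_buffer() cannot create the per-device buffers yet, but the size it is
// handed is the sum over all devices of each shard's padded byte count, which is what
// get_alloc_size() below reports and what ggml-alloc enforces. Float rows are assumed
// here for simplicity; the real code uses ggml_row_size() per tensor type.
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

static size_t total_split_alloc_size(int64_t ne0,
                                     const std::vector<std::pair<int64_t, int64_t>> & row_ranges) {
    const int64_t pad  = 512;            // MATRIX_ROW_PADDING
    const size_t  elem = sizeof(float);
    size_t total = 0;
    for (const auto & r : row_ranges) {  // one (row_low, row_high) range per device
        const int64_t nrows_split = r.second - r.first;
        if (nrows_split == 0) {
            continue;                    // no rows on this device -> nothing to count
        }
        total += (size_t) nrows_split * (size_t) ne0 * elem;
        if (ne0 % pad != 0) {
            total += (size_t) (pad - ne0 % pad) * elem;  // padded tail of the last row
        }
    }
    return total;
}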
+ ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size); +} + +static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + GGML_UNUSED(buft); +} + +static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_sycl_split_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_sycl_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, +}; + +ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n"); + ggml_check_sycl(); + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = ggml_sycl_info().default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_sycl_split_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0), + /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + +// host buffer type + +static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_SYCL_NAME "_Host"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_SYCL_NAME "_Host"; + + GGML_UNUSED(buffer); +} + +static void 
ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_sycl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_sycl_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + // FIXME: this is a hack to avoid having to implement a new buffer type + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_sycl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n"); + static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_sycl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_sycl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0), + /* .context = */ nullptr, + }; + + return &ggml_backend_sycl_buffer_type_host; +} + +// buffer pool for sycl (legacy) +struct ggml_sycl_pool_leg : public ggml_sycl_pool { + static const int MAX_SYCL_BUFFERS = 256; + + int device; + queue_ptr qptr; + struct ggml_sycl_buffer { + void * ptr = nullptr; + size_t size = 0; + }; + + ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {}; + size_t pool_size = 0; + + explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : + qptr(qptr_), + device(device_) { + } + + ~ggml_sycl_pool_leg() { + for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { + ggml_sycl_buffer & b = buffer_pool[i]; + if (b.ptr != nullptr) { + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); + pool_size -= b.size; + } + } + GGML_ASSERT(pool_size == 0); + } + + void * alloc(size_t size, size_t * actual_size) override { +#ifdef DEBUG_sycl_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { + ggml_sycl_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_sycl_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + ggml_sycl_buffer& b = buffer_pool[ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + + SYCL_CHECK( + CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( + look_ahead_size, *qptr))); + if (!ptr) { + fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size); + return nullptr; + } + + *actual_size = look_ahead_size; + pool_size += look_ahead_size; + + #ifdef DEBUG_SYCL_MALLOC + fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u 
MB\n", __func__, id, nnz, + (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); + #endif + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); + return ptr; + } + + void free(void * ptr, size_t size) override { + for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { + ggml_sycl_buffer& b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); + pool_size -= size; + } +}; + +std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { + // TBD: NO VMM support + // if (ggml_sycl_info().devices[device].vmm) { + // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); + // } + return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); +} + +// TBD pool with virtual memory management +// struct ggml_sycl_pool_vmm : public ggml_sycl_pool + +/// kernels + typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_sycl_func_t)(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_sycl_op_mul_mat_t)( @@ -1706,296 +2849,6 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst, }); } -static bool g_sycl_loaded = false; - -bool ggml_sycl_loaded(void) { - return g_sycl_loaded; -} - -void print_device_detail(int id, sycl::device &device, std::string device_type) { - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, device))); - - std::string version; - version += std::to_string(prop.get_major_version()); - version += "."; - version += std::to_string(prop.get_minor_version()); - - device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); - std::string name = std::string(prop.get_name()); - name = std::regex_replace(name, std::regex("\\(R\\)"), ""); - name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); - - auto global_mem_size = prop.get_global_mem_size()/1000000; - - fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), - name.c_str(), version.c_str(), prop.get_max_compute_units(), - prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str()); -} - -void ggml_backend_sycl_print_sycl_devices() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); - int device_count = dpct::dev_mgr::instance().device_count(); - std::map DeviceNums; - fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "| | | | |Max | |Max |Global | |\n"); - fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); - fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); - fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); - for (int id = 0; id < device_count; ++id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - sycl::backend backend = device.get_backend(); - std::string backend_type = get_device_backend_and_type(device); - int type_id=DeviceNums[backend_type]++; - std::stringstream device_type; - device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; - print_device_detail(id, device, device_type.str()); - } -} - -static inline int get_sycl_env(const char 
*env_name, int default_val) { - char *user_device_string = getenv(env_name); - int user_number = default_val; - - unsigned n; - if (user_device_string != NULL && - sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number = default_val; - } - return user_number; -} - -static void ggml_check_sycl() try { - static bool initialized = false; - - if (!initialized) { - fprintf(stderr, "[SYCL] call ggml_check_sycl\n"); - g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - - fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); - -#if defined(GGML_SYCL_F16) - fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); -#endif - -/* NOT REMOVE, keep it for next optimize for XMX. -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif -*/ - - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - initialized = true; - g_sycl_loaded = true; - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static ggml_sycl_device_info ggml_sycl_init() { - ggml_sycl_device_info info = {}; - - info.device_count = dpct::dev_mgr::instance().device_count(); - if (info.device_count == 0) { - fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); - return info; - } - - GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); - - int64_t total_vram = 0; -#if defined(GGML_SYCL_FORCE_MMQ) - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); -#endif -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif - fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); - - for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(i)))); - - info.default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - info.devices[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - - info.max_work_group_sizes[i] = prop.get_max_work_group_size(); - } - - for (int id = 0; id < info.device_count; ++id) { - info.default_tensor_split[id] /= total_vram; - } - return info; -} - -const ggml_sycl_device_info & ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); - return info; -} - -/* -device_index: device index from 0 to n (continue numbers). - It is used for device select/set in SYCL backend internal data structure. 
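// A sketch (not part of the patch) of how the default tensor split is derived in
// ggml_sycl_init() above: entry i is the fraction of total VRAM owned by devices
// 0..i-1, i.e. the cumulative start fraction later consumed by get_row_split(). A
// user-supplied split passed to ggml_backend_sycl_split_buffer_type() is normalized
// into the same cumulative form. Plain doubles stand in for the device memory query.
#include <vector>

static std::vector<float> default_tensor_split(const std::vector<double> & vram_bytes) {
    std::vector<float> split(vram_bytes.size(), 0.0f);
    double total = 0.0;
    for (size_t i = 0; i < vram_bytes.size(); ++i) {
        split[i] = (float) total;   // cumulative VRAM *before* device i
        total   += vram_bytes[i];
    }
    for (float & s : split) {
        s = (float) (s / total);    // normalize to [0, 1)
    }
    return split;
}

// Two devices with 16 GB and 8 GB of VRAM yield {0.0f, 0.666...f}: device 0 gets the
// first two thirds of the rows, device 1 the rest.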
-*/ -inline void check_allow_gpu_index(const int device_index) { - if (device_index >= ggml_sycl_info().device_count) { - char error_buf[256]; - snprintf( - error_buf, - sizeof(error_buf), - "%s error: device_index:%d is out of range: [0-%d]", - __func__, - device_index, - ggml_sycl_info().device_count - 1); - fprintf(stderr, "%s\n", error_buf); - assert(false); - } -} - -// buffer pool for sycl (legacy) -struct ggml_sycl_pool_leg : public ggml_sycl_pool { - static const int MAX_SYCL_BUFFERS = 256; - - int device; - queue_ptr qptr; - struct ggml_sycl_buffer { - void * ptr = nullptr; - size_t size = 0; - }; - - ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {}; - size_t pool_size = 0; - - explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : - qptr(qptr_), - device(device_) { - } - - ~ggml_sycl_pool_leg() { - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer & b = buffer_pool[i]; - if (b.ptr != nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); - pool_size -= b.size; - } - } - GGML_ASSERT(pool_size == 0); - } - - void * alloc(size_t size, size_t * actual_size) override { -#ifdef DEBUG_sycl_MALLOC - int nnz = 0; - size_t max_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer& b = buffer_pool[i]; - if (b.ptr != nullptr) { -#ifdef DEBUG_sycl_MALLOC - ++nnz; - if (b.size > max_size) max_size = b.size; -#endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } - } - } - } - if (ibest >= 0) { - ggml_sycl_buffer& b = buffer_pool[ibest]; - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - void * ptr; - size_t look_ahead_size = (size_t) (1.05 * size); - - SYCL_CHECK( - CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( - look_ahead_size, *qptr))); - if (!ptr) { - fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size); - return nullptr; - } - - *actual_size = look_ahead_size; - pool_size += look_ahead_size; - - #ifdef DEBUG_SYCL_MALLOC - fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); - #endif - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); - return ptr; - } - - void free(void * ptr, size_t size) override { - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer& b = buffer_pool[i]; - if (b.ptr == nullptr) { - b.ptr = ptr; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); - pool_size -= size; - } -}; - -std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { - // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); - // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); -} - -// TBD pool with virtual memory management -// struct ggml_sycl_pool_vmm : public ggml_sycl_pool - static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, const struct ggml_tensor *src, int64_t i3, int64_t i2, @@ -2376,54 +3229,6 @@ inline void 
ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor (void) src1_dd; } -static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { - int64_t min_compute_capability = INT_MAX; - int64_t max_compute_capability = INT_MIN; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) { - if (min_compute_capability > ggml_sycl_info().devices[i].cc) { - min_compute_capability = ggml_sycl_info().devices[i].cc; - } - if (max_compute_capability < ggml_sycl_info().devices[i].cc) { - max_compute_capability = ggml_sycl_info().devices[i].cc; - } - } - } - - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_IQ3_S: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_Q6_K: - return 64; - default: - GGML_ABORT("fatal error"); - } - -} - inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -2783,10 +3588,6 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { peer_access_enabled = enable_peer_access; } -struct ggml_backend_sycl_split_buffer_type_context { - std::array tensor_split; -}; - static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, ggml_sycl_op_mul_mat_t op, @@ -3865,12 +4666,6 @@ static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * s (void) dst; } -static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); -} - void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == main_device) return; check_allow_gpu_index(main_device); @@ -4038,39 +4833,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens return true; } -GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { - GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); - for(int i=0;i=max_len) break; - id_list[i] = i; - } - return; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -int ggml_sycl_get_device_count() try { - int device_count; - if (CHECK_TRY_ERROR(device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - return 0; - } - return device_count; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, +GGML_API void ggml_backend_sycl_get_device_description(int device, char *description, size_t 
description_size) try { - GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n"); + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_description\n"); dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, dpct::dev_mgr::instance().get_device(device)))); @@ -4082,7 +4847,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, +void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total) try { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n"); ggml_sycl_set_device(device); @@ -4108,804 +4873,16 @@ catch (sycl::exception const &exc) { //////////////////////////////////////////////////////////////////////////////// -// backend interface - -#define UNUSED GGML_UNUSED - -// sycl buffer - -struct ggml_backend_sycl_buffer_context { - int device; - void * dev_ptr = nullptr; - queue_ptr stream; - std::string name; - - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { - check_allow_gpu_index(device); - name = (GGML_SYCL_NAME + std::to_string(device)); - } - - - ~ggml_backend_sycl_buffer_context() { - if (dev_ptr != nullptr) { - ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); - } - } -}; - -GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; - return ctx->name.c_str(); -} - -GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; -} - -static void -ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - ggml_sycl_set_device(ctx->device); - - delete ctx; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - return ctx->dev_ptr; -} - -GGML_CALL static void -ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor) try { - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; - - if (tensor->view_src != NULL && tensor->view_offs == 0) { - assert(tensor->view_src->buffer->buft == buffer->buft); - tensor->backend = tensor->view_src->backend; - tensor->extra = tensor->view_src->extra; - return; - } - - - if (ggml_is_quantized(tensor->type)) { - // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); - - if (padded_size > original_size && tensor->view_src == nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset( - (char *)tensor->data + original_size, 0, - padded_size - original_size).wait())); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor, - const void 
*data, size_t offset, - size_t size) try { - - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - - ggml_sycl_set_device(ctx->device); - auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); - SYCL_CHECK( - CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); - char* host_buf = (char*)malloc(size); - memcpy(host_buf, data, size); - SYCL_CHECK( - CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size) - .wait())); - free(host_buf); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *tensor, - void *data, size_t offset, - size_t size) try { - - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - - ggml_sycl_set_device(ctx->device); - auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue(); - - SYCL_CHECK(CHECK_TRY_ERROR( - stream.memcpy(data, (const char *)tensor->data + offset, size) - .wait())); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_CALL static bool -ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *src, - ggml_tensor *dst) try { - if (ggml_backend_buffer_is_sycl(src->buffer)) { - ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; - ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; - - ggml_sycl_set_device(src_ctx->device); - /* - DPCT1009:198: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw())); - ggml_sycl_set_device(dst_ctx->device); - /* - DPCT1009:199: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); - /* - DPCT1009:200: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - - queue_ptr stream_dst = dst_ctx->stream; - queue_ptr stream_src = src_ctx->stream; - size_t size = ggml_nbytes(src); - - //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs. - dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size); - -//todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove -#if 0 - SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy( - (char *)dst->data, (const char *)src->data, size).wait())); - - /* - DPCT1009:201: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. 
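// The cpy_tensor path above routes cross-GPU copies through a dev2dev_memcpy() helper
// because, per the comment, direct device-to-device memcpy across GPUs is a known
// issue. One generic way such a workaround can be implemented is to bounce the data
// through a host staging buffer, as sketched below. This is only an illustration of
// the idea; the actual dev2dev_memcpy() helper is defined elsewhere in the SYCL
// backend and may differ.
#include <sycl/sycl.hpp>
#include <vector>

static void dev2dev_via_host(sycl::queue & q_dst, sycl::queue & q_src,
                             void * dst, const void * src, size_t size) {
    std::vector<char> staging(size);
    q_src.memcpy(staging.data(), src, size).wait();  // device (src GPU) -> host
    q_dst.memcpy(dst, staging.data(), size).wait();  // host -> device (dst GPU)
}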
- */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); -#endif - return true; - } - return false; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - - -static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, - uint8_t value) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - - ggml_sycl_set_device(ctx->device); - queue_ptr stream = ctx->stream; - SYCL_CHECK( - CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); - - SYCL_CHECK(CHECK_TRY_ERROR((*stream) - .memset(ctx->dev_ptr, value, buffer->size) - .wait())); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { - /* .get_name = */ ggml_backend_sycl_buffer_get_name, - /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, - /* .get_base = */ ggml_backend_sycl_buffer_get_base, - /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, - /* .clear = */ ggml_backend_sycl_buffer_clear, - /* .reset = */ NULL, -}; - -// sycl buffer type -struct ggml_backend_sycl_buffer_type_context { - int device; - std::string name; - - // each buffer type has its own stream - queue_ptr stream = nullptr; -}; - -GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) { - ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - - return ctx->name.c_str(); -} -GGML_CALL static ggml_backend_buffer_t -ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, - size_t size) try { - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_sycl_set_device(buft_ctx->device); - const queue_ptr stream = buft_ctx->stream; - size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 - - void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( - size, *stream))); - if (!dev_ptr) { - fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size); - return nullptr; - } - ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); - return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - UNUSED(buft); -} - -static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - return dpct::get_current_device().get_max_mem_alloc_size(); - - UNUSED(buft); -} - -GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - size_t size = ggml_nbytes(tensor); - int64_t ne0 = tensor->ne[0]; - - if 
(ggml_is_quantized(tensor->type)) { - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return size; - - UNUSED(buft); -} - -static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { - /* .get_name = */ ggml_backend_sycl_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .is_host = */ nullptr, -}; - -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); - - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(devicedevice; - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(devicestream(i, 0)}, - }; - } - ggml_backend_sycl_buffer_type_initialized = true; - } - return &ggml_backend_sycl_buffer_types[device]; -} - -// sycl split buffer type -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { - const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor->type, tensor_split); - - *row_low = id == 0 ? 0 : nrows*tensor_split[id]; - *row_low -= *row_low % rounding; - if (id == ggml_sycl_info().device_count - 1) { - *row_high = nrows; - } else { - *row_high = nrows*tensor_split[id + 1]; - *row_high -= *row_high % rounding; - } -} - -struct ggml_backend_sycl_split_buffer_context { - ~ggml_backend_sycl_split_buffer_context() try { - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - if (extra->events[i][is] != nullptr) { - /* - DPCT1009:206: SYCL uses exceptions to report errors and - does not use the error codes. The original code was - commented out and a warning string was inserted. You - need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::destroy_event(extra->events[i][is]))); - } - } - if (extra->data_device[i] != nullptr) { - /* - DPCT1009:207: SYCL uses exceptions to report errors and does - not use the error codes. The original code was commented out - and a warning string was inserted. You need to rewrite this - code. 
- */ - ggml_sycl_set_device(i); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( - extra->data_device[i], *(streams[i])))); - } - } - delete extra; - } - } - catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); - } - - std::vector tensor_extras; - std::vector streams; -}; - -GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { - return GGML_SYCL_NAME "_Split"; - - UNUSED(buffer); -} - -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name; -} - -GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - delete ctx; -} - -GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { - // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced - return (void *)0x1000; - - UNUSED(buffer); -} - -GGML_CALL static void -ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor) try { - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - - ctx->tensor_extras.push_back(extra); - ctx->streams.push_back(&(dpct::get_current_device().default_queue())); - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - // FIXME: do not crash if cudaMalloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - char * buf; - /* - DPCT1009:208: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( - size, *stream))); - if (!buf) { - char err_buf[1024]; - snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); - throw std::runtime_error(err_buf); - } - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - /* - DPCT1009:209: SYCL uses exceptions to report errors and does not use - the error codes. The original code was commented out and a warning - string was inserted. You need to rewrite this code. 
- */ - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memset(buf + original_size, 0, size - original_size) - .wait())); - } - - extra->data_device[i] = buf; - - for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - /* - DPCT1009:210: SYCL uses exceptions to report errors and does not use - the error codes. The original code was commented out and a warning - string was inserted. You need to rewrite this code. - */ - SYCL_CHECK( - CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); - } - } - tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; - tensor->extra = extra; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_CALL static void -ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor, const void *data, - size_t offset, size_t size) try { - // split tensors must always be set in their entirety at once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - const char * buf_host = (const char *)data + offset_split; - /* - DPCT1009:211: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. 
- */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memcpy(extra->data_device[i], buf_host, original_size) - .wait())); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_CALL static void -ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *tensor, void *data, - size_t offset, size_t size) try { - // split tensors must always be set in their entirety at once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf_host = (char *)data + offset_split; - /* - DPCT1009:212: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. 
- */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memcpy(buf_host, extra->data_device[i], original_size) - .wait())); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - UNUSED(buffer); - UNUSED(value); -} - -static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { - /* .get_name = */ ggml_backend_sycl_split_buffer_get_name, - /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, - /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, - /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_sycl_split_buffer_clear, - /* .reset = */ NULL, -}; - -GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Split"; - - UNUSED(buft); -} - -GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point - // instead, we allocate them for each tensor separately in init_tensor - // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, - // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. 
- ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context(); - - return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size); -} - -GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - UNUSED(buft); -} - -GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context; - - size_t total_size = 0; - - const int64_t ne0 = tensor->ne[0]; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - total_size += ggml_nbytes_split(tensor, nrows_split); - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return total_size; -} - -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - UNUSED(buft); -} - -static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = { - /* .get_name = */ ggml_backend_sycl_split_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_sycl_split_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, -}; - -GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n"); - ggml_check_sycl(); - // FIXME: this is not thread safe - static std::map, struct ggml_backend_buffer_type> buft_map; - - std::array tensor_split_arr = {}; - - bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; }); - if (all_zero) { - tensor_split_arr = ggml_sycl_info().default_tensor_split; - } else { - float split_sum = 0.0f; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] /= split_sum; - } - } - - auto it = buft_map.find(tensor_split_arr); - if (it != buft_map.end()) { - return &it->second; - } - - struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_sycl_split_buffer_type_interface, - /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr}, - }; - - auto result = buft_map.emplace(tensor_split_arr, buft); - return &result.first->second; -} - -// host buffer type - -GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Host"; - - UNUSED(buft); -} - -GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { - return GGML_SYCL_NAME "_Host"; - - UNUSED(buffer); -} - -static void 
ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_sycl_host_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr = ggml_sycl_host_malloc(size); - - if (ptr == nullptr) { - // fallback to cpu buffer - return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - } - - // FIXME: this is a hack to avoid having to implement a new buffer type - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.get_name = ggml_backend_sycl_host_buffer_name; - buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n"); - static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_sycl_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_sycl_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .context = */ nullptr, - }; - - return &ggml_backend_sycl_buffer_type_host; -} - // backend -GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) { +static const char * ggml_backend_sycl_get_name(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return sycl_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) { +static void ggml_backend_sycl_free(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; delete sycl_ctx; @@ -4913,12 +4890,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) { } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return ggml_backend_sycl_buffer_type(sycl_ctx->device); } -GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, +static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { @@ -4927,8 +4904,8 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); - SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - (char *)tensor->data + offset, data, size).wait())); + SYCL_CHECK(CHECK_TRY_ERROR( + (stream)->memcpy((char *)tensor->data + offset, data, size))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -4936,7 +4913,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, +static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, 
void *data, size_t offset, size_t size) try { @@ -4954,9 +4931,9 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, - const ggml_tensor *src, - ggml_tensor *dst) try { +static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, + const ggml_tensor *src, + ggml_tensor *dst) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) { /* @@ -4983,7 +4960,7 @@ static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait())); - UNUSED(backend); + GGML_UNUSED(backend); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -4991,7 +4968,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -5019,7 +4996,151 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back return GGML_STATUS_SUCCESS; } -GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +static void ggml_backend_sycl_event_record(ggml_backend_t backend, ggml_backend_event_t event) +try +{ + ggml_backend_sycl_context *sycl_ctx = + (ggml_backend_sycl_context *)backend->context; + sycl::event *sycl_event = static_cast(event->context); + + const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0); + // Record the current state of the queue + SYCL_CHECK(CHECK_TRY_ERROR(*sycl_event = stream->ext_oneapi_submit_barrier())); +} +catch (sycl::exception const &exc) +{ + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try { + ggml_backend_sycl_context* sycl_ctx = static_cast(backend->context); + sycl::event* sycl_event = static_cast(event->context); + + if (ggml_backend_is_sycl(backend)) { + SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait())); + } else + GGML_ABORT("fatal error"); +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static ggml_backend_i ggml_backend_sycl_interface = { + /* .get_name = */ ggml_backend_sycl_get_name, + /* .free = */ ggml_backend_sycl_free, + /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async, + // // TODO: update for the new + // interface + /* .synchronize = */ ggml_backend_sycl_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_sycl_graph_compute, + /* .supports_op = */ NULL, // moved to device + /* .supports_buft = */ NULL, // moved to 
device + /* .offload_op = */ NULL, // moved to device + /* .event_record = */ ggml_backend_sycl_event_record, + /* .event_wait = */ ggml_backend_sycl_event_wait, +}; + +static ggml_guid_t ggml_backend_sycl_guid() { + static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 }; + return &guid; +} + +bool ggml_backend_is_sycl(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid()); +} + +int ggml_backend_sycl_get_device_count() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); + return ggml_sycl_info().device_count; +} + + +// backend device + +struct ggml_backend_sycl_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_sycl_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context; + ggml_sycl_set_device(ctx->device); + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(ctx->device).get_memory_info(*free, *total))); +} + +static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_sycl_device_get_name(dev); + props->description = ggml_backend_sycl_device_get_description(dev); + props->type = ggml_backend_sycl_device_get_type(dev); + ggml_backend_sycl_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_SYCL_NO_PINNED") == nullptr; +#ifdef GGML_SYCL_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* .async = */ true, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ events, + }; +} + +static ggml_backend_t ggml_backend_sycl_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context; + return ggml_backend_sycl_init(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context; + return ggml_backend_sycl_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_sycl_host_buffer_type(); +} + +static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { switch (op->op) { case GGML_OP_CONV_TRANSPOSE_1D: { 
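(Usage sketch, not part of the patch: the registry/device entry points introduced in the hunk above can be exercised roughly as follows. The generic ggml_backend_reg_dev_count / ggml_backend_reg_dev_get / ggml_backend_dev_* helpers are assumed from ggml-backend.h, and ggml_backend_sycl_reg() is assumed to be exported from ggml-sycl.h; neither is defined in this hunk.)

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-sycl.h"   // assumed to declare ggml_backend_sycl_reg()

static void list_sycl_devices() {
    ggml_backend_reg_t reg = ggml_backend_sycl_reg();
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);

        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);   // filled by ggml_backend_sycl_device_get_props above

        printf("%s (%s): %zu / %zu bytes free, async=%d events=%d\n",
               props.name, props.description, props.memory_free, props.memory_total,
               props.caps.async, props.caps.events);

        ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ nullptr); // ends up in ggml_backend_sycl_init()
        ggml_backend_free(backend);
    }
}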
@@ -5163,53 +5284,176 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } - UNUSED(backend); + GGML_UNUSED(dev); } -GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { - const int min_batch_size = 32; - return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID; - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { +static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) { return false; } ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context; return buft_ctx->device == sycl_ctx->device; } -static ggml_backend_i ggml_backend_sycl_interface = { - /* .get_name = */ ggml_backend_sycl_name, - /* .free = */ ggml_backend_sycl_free, - /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, - /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, - /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, - /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface - /* .synchronize = */ ggml_backend_sycl_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_sycl_graph_compute, - /* .supports_op = */ ggml_backend_sycl_supports_op, - /* .supports_buft = */ ggml_backend_sycl_supports_buft, - /* .offload_op = */ ggml_backend_sycl_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static ggml_guid_t ggml_backend_sycl_guid() { - static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 }; - return &guid; +static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID; + GGML_UNUSED(dev); } -GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { +static ggml_backend_event_t +ggml_backend_sycl_device_event_new(ggml_backend_dev_t dev) { + +#ifdef GGML_SYCL_NO_PEER_COPY + return nullptr; +#else + sycl::event *event_ptr = new sycl::event(); + + return new ggml_backend_event{ + /* .device = */ dev, + /* .context = */ event_ptr, + }; +#endif +} + +static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) try { + GGML_UNUSED(dev); + if (event == nullptr) { + return; + } + + if (event->context != nullptr) { + sycl::event *sycl_event = static_cast<sycl::event *>(event->context); + delete sycl_event; + event->context = nullptr; + } + + delete event; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + + +static void
ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try { + GGML_UNUSED(dev); + + sycl::event *sycl_event = static_cast<sycl::event *>(event->context); + SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait())); +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static const ggml_backend_device_i ggml_backend_sycl_device_interface = { + /* .get_name = */ ggml_backend_sycl_device_get_name, + /* .get_description = */ ggml_backend_sycl_device_get_description, + /* .get_memory = */ ggml_backend_sycl_device_get_memory, + /* .get_type = */ ggml_backend_sycl_device_get_type, + /* .get_props = */ ggml_backend_sycl_device_get_props, + /* .init_backend = */ ggml_backend_sycl_device_init, + /* .get_buffer_type = */ ggml_backend_sycl_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_sycl_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ ggml_backend_sycl_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_sycl_device_supports_op, + /* .supports_buft = */ ggml_backend_sycl_device_supports_buft, + /* .offload_op = */ ggml_backend_sycl_device_offload_op, + /* .event_new = */ ggml_backend_sycl_device_event_new, + /* .event_free = */ ggml_backend_sycl_device_event_free, + /* .event_synchronize = */ ggml_backend_sycl_device_event_synchronize, +}; + +// backend reg + +struct ggml_backend_sycl_reg_context { + std::vector<ggml_backend_dev_t> devices; +}; + +static const char * ggml_backend_sycl_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_SYCL_NAME; +} + +static size_t ggml_backend_sycl_reg_get_device_count(ggml_backend_reg_t reg) { + ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context; + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) +{ + GGML_UNUSED(reg); + if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + return (void *)ggml_backend_sycl_split_buffer_type; + } + // SYCL doesn't support registering host memory, left here for reference + // "ggml_backend_register_host_buffer" + // "ggml_backend_unregister_host_buffer" + return nullptr; +} + +static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = { + /* .get_name = */ ggml_backend_sycl_reg_get_name, + /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count, + /* .get_device_get = */ ggml_backend_sycl_reg_get_device, + /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address, +}; + + +// backend registry + +ggml_backend_reg_t ggml_backend_sycl_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard<std::mutex> lock(mutex); + if (!initialized) { + ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context; + + for (int i = 0; i < ggml_sycl_info().device_count; i++) { + ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context; + dev_ctx->device = i; + dev_ctx->name = GGML_SYCL_NAME + std::to_string(i); + + ggml_sycl_set_device(i); + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(i)))); + +
dev_ctx->description = prop.get_name(); + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .interface = */ ggml_backend_sycl_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .interface = */ ggml_backend_sycl_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + + return &reg; +} + +ggml_backend_t ggml_backend_sycl_init(int device) { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n"); ggml_check_sycl(); @@ -5224,36 +5468,10 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { ggml_backend_t sycl_backend = new ggml_backend { /* .guid = */ ggml_backend_sycl_guid(), /* .interface = */ ggml_backend_sycl_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), /* .context = */ ctx }; return sycl_backend; } -bool ggml_backend_is_sycl(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid()); -} - -GGML_CALL int ggml_backend_sycl_get_device_count() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); - return ggml_sycl_info().device_count; -} - -GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { - ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data); - return sycl_backend; - - UNUSED(params); -} - -extern "C" int ggml_backend_sycl_reg_devices(); - -int ggml_backend_sycl_reg_devices() { - assert(ggml_sycl_info().device_count>0); - for (int i = 0; i < ggml_sycl_info().device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i); - ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i); - } - return ggml_sycl_info().device_count; -} diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 8f4041fff..b8304c3a2 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -55,12 +55,12 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, #ifdef GGML_SYCL_F16 // v = v * {d, d}; // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); #else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); #endif // GGML_SYCL_F16 } @@ -110,11 +110,11 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib, #ifdef GGML_SYCL_F16 // v = v * {d, d}; // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); #else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); #endif // GGML_SYCL_F16 } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 1b96925e1..7b10cf688 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1,6 +1,6 @@ #include "mmvq.hpp" #include "vecdotq.hpp" - +#include <cassert> template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, @@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ } const int blocks_per_row = ncols / qk; - const int 
blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma 
unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - + const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { 
mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler 
&cgh) { @@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K 
== 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq2_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -749,7 +751,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -759,7 +761,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq2_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -774,7 +776,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -784,7 +786,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq2_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -799,7 +801,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -809,7 +811,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq3_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -824,7 +826,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -833,7 +835,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), 
[=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq3_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -848,7 +850,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -858,7 +860,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq1_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -873,13 +875,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq1_m_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -894,14 +896,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_NL == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq4_nl_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -916,14 +918,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { mul_mat_vec_q_iq4_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index f9da45881..e749bbe70 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -117,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context { vk_device device; }; -GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); -GGML_CALL 
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); +static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); +static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); +static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); +static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); +static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, @@ -431,16 +433,6 @@ struct vk_context_struct { typedef std::shared_ptr vk_context; typedef std::weak_ptr vk_context_ref; -struct ggml_tensor_extra_gpu { - vk_buffer_ref buffer_gpu; - uint64_t offset; - - void reset() { - buffer_gpu.reset(); - offset = 0; - } -}; - struct ggml_vk_garbage_collector { std::vector tl_semaphores; std::vector semaphores; @@ -551,6 +543,31 @@ struct ggml_backend_vk_context { std::vector tensor_ctxs; }; +static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; +} + +struct ggml_backend_vk_buffer_context { + vk_device_ref device; + vk_buffer dev_buffer; + std::string name; + + ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : + device(device), + dev_buffer(dev_buffer), + name(name) { + } + + ~ggml_backend_vk_buffer_context() { + ggml_vk_destroy_buffer(dev_buffer); + } +}; + #ifdef GGML_VULKAN_MEMORY_DEBUG void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { std::lock_guard guard(log_mutex); @@ -605,15 +622,18 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor); typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); +static void ggml_backend_vk_free(ggml_backend_t backend); -static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { +// variables to track number of compiles in progress +static uint32_t compile_count = 0; +static std::mutex compile_count_mutex; +static std::condition_variable compile_count_cond; + +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align) 
{ VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - std::lock_guard guard(device->mutex); - pipeline = std::make_shared(); pipeline->name = name; pipeline->parameter_count = parameter_count; @@ -681,7 +701,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co pipeline->layout); pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - device->pipelines.insert({ pipeline->name, pipeline }); + { + std::lock_guard guard(device->mutex); + device->pipelines.insert({ pipeline->name, pipeline }); + } + + { + std::lock_guard guard(compile_count_mutex); + assert(compile_count > 0); + compile_count--; + } + compile_count_cond.notify_all(); } static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { @@ -1040,10 +1070,25 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor try { buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index }); } catch (const vk::SystemError& e) { - // Out of Host/Device memory, clean up buffer - device->device.destroyBuffer(buf->buffer); - buf->size = 0; - throw e; + if (buf->memory_property_flags != fallback_flags) { + // Try again with fallback flags + memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags); + buf->memory_property_flags = fallback_flags; + + try { + buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index }); + } + catch (const vk::SystemError& e) { + device->device.destroyBuffer(buf->buffer); + buf->size = 0; + throw e; + } + } else { + // Out of Host/Device memory, clean up buffer + device->device.destroyBuffer(buf->buffer); + buf->size = 0; + throw e; + } } buf->ptr = nullptr; @@ -1079,7 +1124,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { // Fall back to host memory type buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { - buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal); + // use rebar if available, otherwise fallback to device only visible memory + buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal); } } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." 
<< std::endl; @@ -1148,11 +1194,11 @@ static void ggml_vk_load_shaders(vk_device& device) { // mulmat std::initializer_list warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - std::initializer_list warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + std::initializer_list warptile_s = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; std::initializer_list warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - std::initializer_list warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + std::initializer_list warptile_mmq_s = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; std::array l_wg_denoms = {128, 128, 1 }; std::array m_wg_denoms = { 64, 64, 1 }; @@ -1193,6 +1239,20 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); + std::vector> compiles; + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { + { + // wait until fewer than N compiles are in progress + uint32_t N = std::max(1u, std::thread::hardware_concurrency()); + std::unique_lock guard(compile_count_mutex); + while (compile_count >= N) { + compile_count_cond.wait(guard); + } + compile_count++; + } + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align)); + }; + if (device->fp16) { ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); @@ -1742,6 +1802,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); + + for (auto &c : compiles) { + c.wait(); + } } static vk_device ggml_vk_get_device(size_t idx) { @@ -1877,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } - device->name = device->properties.deviceName.data(); + device->name = GGML_VK_NAME + std::to_string(idx); device_create_info = { 
vk::DeviceCreateFlags(), @@ -1904,6 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->buffer_type = { /* .iface = */ ggml_backend_vk_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx), /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, }; @@ -2806,7 +2871,11 @@ static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); - if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + + // If the device is not an UMA device the memory is host-accessible through rebar. While writing + // through PCIe is sufficient fast reading back data from PCIe is slower than going through + // the HW device to host copy path. + if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); memcpy(dst, (uint8_t *) src->ptr + offset, size); @@ -3038,9 +3107,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3142,8 +3211,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); vk_buffer d_X; @@ -3151,13 +3220,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_buffer d_Y; uint64_t y_buf_offset = 0; if (!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3238,9 +3307,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * 
src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3319,21 +3388,21 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if(!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3416,9 +3485,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c GGML_ASSERT(ne11 == 1); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qy; size_t qy_buf_offset = 0; @@ -3444,15 +3513,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); - vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; + vk_buffer d_Qx = src0_buf_ctx->dev_buffer; + const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3494,9 +3563,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(ne11 == 1); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; @@ -3523,15 +3592,15 @@ static void 
ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); - vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; + vk_buffer d_Qx = src0_buf_ctx->dev_buffer; + const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3593,10 +3662,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t n_as = ne02; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3693,26 +3762,26 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if (!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (!ids_uma) { - d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset + ids->view_offs; + d_ids = ids_buf_ctx->dev_buffer; + ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -3798,10 +3867,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t ne22 = dst->ne[2]; const uint64_t ne23 = dst->ne[3]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * 
src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3886,26 +3955,26 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if(!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if(!ids_uma) { - d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset + ids->view_offs; + d_ids = ids_buf_ctx->dev_buffer; + ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -4212,7 +4281,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT - GGML_ASSERT(dst->extra != nullptr); + GGML_ASSERT(dst->buffer != nullptr); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; @@ -4258,10 +4327,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr; + ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr; vk_buffer d_X = nullptr; size_t x_buf_offset = 0; @@ -4292,7 +4361,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co uint64_t z_sz = use_src2 ? 
ggml_type_size(src2->type) * ne2 : 0; uint64_t d_sz = ggml_type_size(dst->type) * ned; - vk_buffer d_D = extra->buffer_gpu.lock(); + vk_buffer d_D = dst_buf_ctx->dev_buffer; // Workaround for tiny tensor inputs on ROPE if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) { @@ -4300,21 +4369,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; - GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT + uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; + GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT if(!src0_uma) { - d_X = extra_src0->buffer_gpu.lock(); - x_buf_offset = extra_src0->offset + src0->view_offs; + d_X = src0_buf_ctx->dev_buffer; + x_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_X != nullptr); } if (use_src1 && !src1_uma) { - d_Y = extra_src1->buffer_gpu.lock(); - y_buf_offset = extra_src1->offset + src1->view_offs; + d_Y = src1_buf_ctx->dev_buffer; + y_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Y != nullptr); } if (use_src2 && !src2_uma) { - d_Z = extra_src2->buffer_gpu.lock(); - z_buf_offset = extra_src2->offset + src2->view_offs; + d_Z = src2_buf_ctx->dev_buffer; + z_buf_offset = vk_tensor_offset(src2) + src2->view_offs; GGML_ASSERT(d_Z != nullptr); } @@ -4493,11 +4562,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, } static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 @@ -4686,10 +4754,9 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), @@ -5008,6 +5075,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, 
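The destination offset handling in ggml_vk_op_f32, ggml_vk_acc and ggml_vk_cpy above splits the raw byte offset in two: the part bound to the descriptor is rounded down to minStorageBufferOffsetAlignment, and the remainder is passed to the shader as an element count. A self-contained worked example of that arithmetic (the numbers are made up for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t min_align  = 256;        // e.g. the device's minStorageBufferOffsetAlignment
    const uint64_t tensor_off = 1600;       // vk_tensor_offset(dst) + dst->view_offs
    const uint32_t type_size  = 4;          // float32 destination

    const uint64_t d_buf_offset = (tensor_off / min_align) * min_align;              // 1536, bound to the descriptor
    const uint32_t d_offset     = (uint32_t) ((tensor_off % min_align) / type_size); // 16 elements, pushed to the shader

    printf("descriptor offset = %llu, element offset = %u\n",
           (unsigned long long) d_buf_offset, d_offset);
    return 0;
}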
size_t m, size_t } } + ggml_pipeline_allocate_descriptor_sets(ctx->device); + vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -5124,7 +5193,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t avg_err /= m * n; - std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl; + double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.1) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; @@ -5216,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg return; } - ggml_type_traits_t tt = ggml_internal_get_type_traits(quant); + const auto * tt = ggml_get_type_traits(quant); - ggml_to_float_t dequant_fn = tt.to_float; + ggml_to_float_t dequant_fn = tt->to_float; dequant_fn(from, to, ne); } @@ -5246,12 +5317,14 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); + ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -5378,6 +5451,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } + ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); @@ -5445,7 +5520,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, avg_err /= m * n; - std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl; + double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.01 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; @@ -5487,19 +5564,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * 
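The test harness now reports throughput next to the average error. A matrix multiply of size m x n x k costs 2*m*n*k floating-point operations, so over batch matrices and num_it iterations taking time milliseconds the rate is 2*m*n*k*batch*num_it / (time/1000) / 1e12 TFLOPS, exactly as computed above. A tiny standalone check with illustrative numbers:

#include <cstdio>

int main() {
    const double m = 4096, n = 4096, k = 4096, batch = 1, num_it = 10;
    const double time_ms = 250.0;   // total wall time for num_it iterations

    const double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / 1e12;
    printf("%.2f TFLOPS\n", tflops);   // ~5.50 TFLOPS for these numbers
    return 0;
}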
ctx, size_t m, } #endif -static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) { - VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))"); - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - extra->reset(); - tensor->extra = extra; - return extra; -} - static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) - ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); @@ -5666,9 +5732,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; - - if (ggml_is_empty(node) || extra == nullptr) { + if (ggml_is_empty(node) || !node->buffer) { return false; } @@ -5920,7 +5984,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ - ggml_tensor_extra_gpu * extra = nullptr; + ggml_backend_buffer * buf = nullptr; switch (tensor->op) { case GGML_OP_ADD: @@ -5956,7 +6020,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: case GGML_OP_REPEAT: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; case GGML_OP_UNARY: @@ -5966,7 +6030,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; default: return false; @@ -5974,14 +6038,14 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; default: return false; } - if (extra == nullptr) { + if (buf == nullptr) { return false; } @@ -6099,13 +6163,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->device->device.destroyFence(ctx->fence); } -GGML_CALL static int ggml_vk_get_device_count() { +static int ggml_vk_get_device_count() { ggml_vk_instance_init(); return vk_instance.device_indices.size(); } -GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { +static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { ggml_vk_instance_init(); std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); @@ -6122,111 +6186,61 @@ GGML_CALL static void ggml_vk_get_device_description(int device, 
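With ggml_vk_tensor_create_extra gone, whether a node has device storage is read straight off ggml_tensor::buffer, which is why ggml_vk_build_graph and ggml_vk_compute_forward above now check the buffer pointer instead of a backend-specific extra. The guard reduces to something like this sketch (ggml_is_empty is the public helper from ggml.h):

#include "ggml.h"

// Sketch of the node filter now used when building the Vulkan graph.
static bool vk_node_has_work(const struct ggml_tensor * node) {
    return !ggml_is_empty(node) && node->buffer != NULL;
}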
char * descript // device backend -static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - -struct ggml_backend_vk_buffer_context { - vk_device_ref device; - vk_buffer dev_buffer; - ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; - size_t temp_tensor_extra_index = 0; - std::string name; - - ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : - device(device), - dev_buffer(dev_buffer), - name(name) { - } - - ~ggml_backend_vk_buffer_context() { - ggml_vk_destroy_buffer(dev_buffer); - if (temp_tensor_extras != nullptr) { - delete[] temp_tensor_extras; - } - } - - ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() { - if (temp_tensor_extras == nullptr) { - temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES]; - } - - size_t alloc_index = temp_tensor_extra_index; - temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES; - ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; - extra->reset(); - - return extra; - } -}; - -GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { +static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_vk_buffer_get_name; } -GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; } -GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { return vk_ptr_base; UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); - ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - GGML_ASSERT(tensor->view_src->extra != nullptr); - tensor->extra = tensor->view_src->extra; - } else { - ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); - extra->buffer_gpu = ctx->dev_buffer; - extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; - tensor->extra = extra; } } -GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = 
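The removal above also slims ggml_backend_vk_buffer_context down to the data it actually owns; the pool of temporary tensor extras and its round-robin index are no longer needed because offsets are derived on demand. Roughly (reconstructed from the removed definition, not copied from the new source; the Vulkan-specific member types are elided as comments):

#include <string>

struct vk_buffer_context_sketch {
    // vk_device_ref device;        // kept: reference to the owning device
    // vk_buffer    dev_buffer;     // kept: the wrapped buffer, destroyed in the destructor
    std::string name;               // kept: label returned by get_name()
    // temp_tensor_extras / temp_tensor_extra_index: removed by this patch
};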
(ggml_backend_vk_buffer_context *)buffer->context; + vk_buffer buf = buf_ctx->dev_buffer; - vk_buffer buf = extra->buffer_gpu.lock(); - - ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size); - - GGML_UNUSED(buffer); + ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - vk_buffer buf = extra->buffer_gpu.lock(); + vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read(buf, extra->offset + tensor->view_offs + offset, data, size); - - GGML_UNUSED(buffer); + ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_vk(src->buffer)) { - ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - vk_buffer src_buf = src_extra->buffer_gpu.lock(); - vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + vk_buffer src_buf = src_buf_ctx->dev_buffer; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); return true; } @@ -6235,7 +6249,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); @@ -6255,13 +6269,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { }; // vk buffer type -GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; return ctx->name.c_str(); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { 
VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; @@ -6277,23 +6291,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->properties.limits.minStorageBufferOffsetAlignment; } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->max_memory_allocation_size; } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { return ggml_nbytes(tensor); UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { +ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { ggml_vk_instance_init(); VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); @@ -6305,24 +6319,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) // host buffer type -GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_VK_NAME "_Host"; UNUSED(buft); } -GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_VK_NAME "_Host"; UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type @@ -6346,7 +6360,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu UNUSED(buft); } -GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment; UNUSED(buft); @@ -6354,7 +6368,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp -GGML_CALL 
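The host buffer type above still pads the request by 32 bytes to mirror the CPU buffer type and hands out host-visible (pinned) memory for faster transfers. A usage sketch through the public API (ggml-backend.h and ggml-vulkan.h; error handling omitted, and the backend may fall back to plain CPU memory if pinned allocation fails):

#include "ggml-backend.h"
#include "ggml-vulkan.h"

static ggml_backend_buffer_t alloc_pinned_staging(size_t nbytes) {
    ggml_backend_buffer_type_t host_buft = ggml_backend_vk_host_buffer_type();
    return ggml_backend_buft_alloc_buffer(host_buft, nbytes);
}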
ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_vk_host_buffer_type_name, @@ -6364,6 +6378,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0), /* .context = */ nullptr, }; @@ -6377,13 +6392,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { // backend -GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { +static const char * ggml_backend_vk_name(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return ctx->name.c_str(); } -GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { +static void ggml_backend_vk_free(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); @@ -6393,18 +6408,18 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return &ctx->device->buffer_type; } -GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context transfer_ctx; @@ -6417,17 +6432,17 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer buf = extra->buffer_gpu.lock(); + vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); + ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported 
buffer type"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context transfer_ctx; @@ -6440,17 +6455,17 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer buf = extra->buffer_gpu.lock(); + vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); + ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { - ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; vk_context transfer_ctx; @@ -6463,17 +6478,17 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer src_buf = src_extra->buffer_gpu.lock(); - vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + vk_buffer src_buf = src_buf_ctx->dev_buffer; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); return true; } return false; } -GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { +static void ggml_backend_vk_synchronize(ggml_backend_t backend) { VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if(ctx->transfer_ctx.expired()) { @@ -6503,7 +6518,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -6566,9 +6581,135 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen UNUSED(backend); } -GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context; +// TODO: enable async and 
synchronize +static ggml_backend_i ggml_backend_vk_interface = { + /* .get_name = */ ggml_backend_vk_name, + /* .free = */ ggml_backend_vk_free, + /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type, + /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_vk_graph_compute, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; +static ggml_guid_t ggml_backend_vk_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + +ggml_backend_t ggml_backend_vk_init(size_t dev_num) { + VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); + + ggml_backend_vk_context * ctx = new ggml_backend_vk_context; + ggml_vk_init(ctx, dev_num); + + ggml_backend_t vk_backend = new ggml_backend { + /* .guid = */ ggml_backend_vk_guid(), + /* .interface = */ ggml_backend_vk_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), + /* .context = */ ctx, + }; + + return vk_backend; +} + +bool ggml_backend_is_vk(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); +} + +int ggml_backend_vk_get_device_count() { + return ggml_vk_get_device_count(); +} + +void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + int dev_idx = vk_instance.device_indices[device]; + ggml_vk_get_device_description(dev_idx, description, description_size); +} + +void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + + vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; + + vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); + + for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; + *free = heap.size; + break; + } + } +} + +////////////////////////// + +struct ggml_backend_vk_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; + ggml_backend_vk_get_device_memory(ctx->device, free, total); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = 
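The public device queries kept by this refactor (ggml_backend_vk_get_device_count, _description, _memory) are now thin wrappers over the Vulkan instance and the per-device context. A usage sketch (declarations from ggml-vulkan.h; buffer size and formatting are illustrative):

#include <cstdio>
#include "ggml-vulkan.h"

int main() {
    const int n = ggml_backend_vk_get_device_count();
    for (int i = 0; i < n; i++) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
        ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);
        printf("Vulkan device %d: %s (%zu / %zu bytes free)\n", i, desc, free_mem, total_mem);
    }
    return 0;
}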
(ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_vk_host_buffer_type(); +} + +static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_vk_device_get_name(dev); + props->description = ggml_backend_vk_device_get_description(dev); + props->type = ggml_backend_vk_device_get_type(dev); + ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ true, + /* events */ false, + }; +} + +static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_init(ctx->device); +} + +static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -6686,120 +6827,101 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const return false; } - UNUSED(backend); + UNUSED(dev); } -GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + + return buft_ctx->device->idx == ctx->device; +} + +static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - UNUSED(backend); + UNUSED(dev); } -GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->device == ctx->device; -} - -// TODO: enable async and synchronize -static ggml_backend_i ggml_backend_vk_interface = { - /* .get_name = */ ggml_backend_vk_name, - /* .free = */ ggml_backend_vk_free, - /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type, - /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_vk_graph_compute, - /* .supports_op = */ ggml_backend_vk_supports_op, - /* .supports_buft = */ 
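ggml_backend_vk_device_get_props above fills the generic property struct, including the capability flags (no async copies, host buffer available, no events yet). Callers are expected to go through the device API rather than the backend; a sketch assuming the ggml-backend accessor ggml_backend_dev_get_props of the revision this diff targets:

#include <cstdio>
#include "ggml-backend.h"

static void print_vk_device_props(ggml_backend_dev_t dev) {
    struct ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);
    printf("%s (%s): free=%zu total=%zu async=%d host_buffer=%d events=%d\n",
           props.name, props.description, props.memory_free, props.memory_total,
           props.caps.async, props.caps.host_buffer, props.caps.events);
}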
ggml_backend_vk_supports_buft, - /* .offload_op = */ ggml_backend_vk_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, +static const struct ggml_backend_device_i ggml_backend_vk_device_i = { + /* .get_name = */ ggml_backend_vk_device_get_name, + /* .get_description = */ ggml_backend_vk_device_get_description, + /* .get_memory = */ ggml_backend_vk_device_get_memory, + /* .get_type = */ ggml_backend_vk_device_get_type, + /* .get_props = */ ggml_backend_vk_device_get_props, + /* .init_backend = */ ggml_backend_vk_device_init, + /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_vk_device_supports_op, + /* .supports_buft = */ ggml_backend_vk_device_supports_buft, + /* .offload_op = */ ggml_backend_vk_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, }; -static ggml_guid_t ggml_backend_vk_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; +static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_VK_NAME; } -GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) { - VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); - - ggml_backend_vk_context * ctx = new ggml_backend_vk_context; - ggml_vk_init(ctx, dev_num); - - ggml_backend_t vk_backend = new ggml_backend { - /* .guid = */ ggml_backend_vk_guid(), - /* .interface = */ ggml_backend_vk_interface, - /* .context = */ ctx, - }; - - return vk_backend; +static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_vk_get_device_count(); } -GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); -} +static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; -GGML_CALL int ggml_backend_vk_get_device_count() { - return ggml_vk_get_device_count(); -} + static bool initialized = false; -GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { - ggml_vk_get_device_description(device, description, description_size); -} - -GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - - vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; - - vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); - - for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { - if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { - *total = heap.size; - *free = heap.size; - break; + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) { + ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; + char desc[256]; + ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); + ctx->device = i; + ctx->name = GGML_VK_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ 
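The registry entry point that replaces ggml_backend_vk_reg_devices builds its device list lazily under a mutex and exposes it through ggml_backend_vk_reg(). A usage sketch for initializing the first Vulkan device through the registry; ggml_backend_reg_dev_count and ggml_backend_dev_init are assumed from the ggml-backend device API this patch targets (only ggml_backend_reg_dev_get appears in the diff itself):

#include "ggml-backend.h"
#include "ggml-vulkan.h"

static ggml_backend_t init_first_vk_backend(void) {
    ggml_backend_reg_t reg = ggml_backend_vk_reg();
    if (ggml_backend_reg_dev_count(reg) == 0) {
        return nullptr;
    }
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
    return ggml_backend_dev_init(dev, /*params=*/nullptr);   // ends up in ggml_backend_vk_device_init
}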
ggml_backend_vk_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; } } + + GGML_ASSERT(device < devices.size()); + return devices[device]; } -// backend registry -GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) { - ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data); - return vk_backend; +static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { + /* .get_name = */ ggml_backend_vk_reg_get_name, + /* .get_device_count = */ ggml_backend_vk_reg_get_device_count, + /* .get_device = */ ggml_backend_vk_reg_get_device, + /* .get_proc_address = */ NULL, +}; - UNUSED(params); -} +ggml_backend_reg_t ggml_backend_vk_reg() { + static ggml_backend_reg reg = { + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, + }; -extern "C" GGML_CALL int ggml_backend_vk_reg_devices(); - -GGML_CALL int ggml_backend_vk_reg_devices() { - ggml_vk_instance_init(); - - for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i); - ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT - } - return vk_instance.device_indices.size(); + return ® } // Extension availability @@ -6904,10 +7026,10 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); + vk_buffer buffer_gpu = buf_ctx->dev_buffer; + ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -6981,9 +7103,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src0->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src0->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src0) + src0->view_offs; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { @@ -7023,9 +7145,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src1->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src1->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src1) + src1->view_offs; if 
(!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { @@ -7065,9 +7187,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src2_clone->data, src2->data, src2_size); memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src2->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src2->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src2) + src2->view_offs; if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) { for (int i3 = 0; i3 < src2->ne[3]; i3++) { for (int i2 = 0; i2 < src2->ne[2]; i2++) { @@ -7122,7 +7244,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_PAD) { tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); } else if (tensor->op == GGML_OP_REPEAT) { - tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone); + tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_ACC) { @@ -7267,14 +7389,15 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) { - tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs); + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs; + if (offset + tensor_size >= buffer_gpu->size) { + tensor_size = buffer_gpu->size - offset; } - ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); + ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size); } float first_error_result = -1.0f; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4b782b0c1..b16c462fa 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -35,13 +35,6 @@ #include #endif -#ifdef GGML_USE_METAL -#include -#endif - -#if defined(__ARM_FEATURE_SVE) -int ggml_sve_cnt_b = 0; -#endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -192,6 +185,8 @@ typedef pthread_t ggml_thread_t; #endif #if defined(__APPLE__) +#include +#include #include #endif @@ -322,26 +317,64 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ // logging // +struct ggml_logger_state { + ggml_log_callback log_callback; + void * log_callback_user_data; +}; +static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL}; + +static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) { + return; + } + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + free(buffer2); + } + va_end(args_copy); +} + +void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { + va_list args; + va_start(args, format); + ggml_log_internal_v(level, format, args); + va_end(args); +} + +void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + #if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG(...) #endif #if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG_5(...) #endif #if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG_10(...) #endif -#define GGML_PRINT(...) printf(__VA_ARGS__) - // // end of logging block // @@ -352,22 +385,40 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
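On the ggml.c side, the print macros are rerouted through a process-wide logger state with a default callback that writes to stderr. ggml_log_internal_v formats into a 128-byte stack buffer first and only heap-allocates when the message is longer. A sketch of a custom sink with the callback signature shown above; the setter name (ggml_log_set) is an assumption, as it is not part of this hunk:

#include <cstdio>
#include "ggml.h"

// Callback matching ggml_log_callback: route everything to a log file instead of stderr.
static void my_log_sink(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    fputs(text, (FILE *) user_data);
}

// Usage (assumed setter): ggml_log_set(my_log_sink, my_log_file);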
{ //#define GGML_SOFT_MAX_ACCELERATE #endif + +void * ggml_aligned_malloc(size_t size) { #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) + return _aligned_malloc(size, TENSOR_ALIGNMENT); #else -inline static void * ggml_aligned_malloc(size_t size) { if (size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); return NULL; } void * aligned_memory = NULL; #ifdef GGML_USE_CPU_HBM - int result = hbw_posix_memalign(&aligned_memory, 16, size); + int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); +#elif TARGET_OS_OSX + kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); + int result = EFAULT; + switch (alloc_status) { + case KERN_SUCCESS: + result = 0; + break; + case KERN_INVALID_ADDRESS: + result = EINVAL; + break; + case KERN_NO_SPACE: + result = ENOMEM; + break; + default: + result = EFAULT; + break; + } #elif GGML_USE_METAL - int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); + const long page_size = sysconf(_SC_PAGESIZE); + int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size); #else - int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); + int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); #endif if (result != 0) { // Handle allocation failure @@ -380,28 +431,40 @@ inline static void * ggml_aligned_malloc(size_t size) { error_desc = "insufficient memory"; break; } - GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); return NULL; } return aligned_memory; +#endif } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#ifdef GGML_USE_CPU_HBM -#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) + +void ggml_aligned_free(void * ptr, size_t size) { + GGML_UNUSED(size); +#if defined(_MSC_VER) || defined(__MINGW32__) + _aligned_free(ptr); +#elif GGML_USE_CPU_HBM + if (ptr != NULL) { + hbw_free(ptr); + } +#elif TARGET_OS_OSX + if (ptr != NULL) { + vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size); + } #else -#define GGML_ALIGNED_FREE(ptr) free(ptr) -#endif + free(ptr); #endif +} + inline static void * ggml_malloc(size_t size) { if (size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n"); return NULL; } void * result = malloc(size); if (result == NULL) { - GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); } return result; @@ -410,12 +473,12 @@ inline static void * ggml_malloc(size_t size) { // calloc inline static void * ggml_calloc(size_t num, size_t size) { if (num == 0 || size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n"); return NULL; } void * result = calloc(num, size); if (result == NULL) { - 
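ggml_aligned_malloc is now a regular (non-static) function with a matching ggml_aligned_free(ptr, size), because the macOS path maps memory with vm_allocate and vm_deallocate needs the original size back. Internal callers therefore have to carry the allocation size to the free call, as in this sketch (the prototypes are assumed to live in ggml-impl.h):

#include <stddef.h>

void * ggml_aligned_malloc(size_t size);          // assumed prototypes (ggml-impl.h)
void   ggml_aligned_free(void * ptr, size_t size);

static void scratch_roundtrip(void) {
    const size_t sz = 1u << 20;                   // 1 MiB scratch area
    void * buf = ggml_aligned_malloc(sz);
    if (buf != NULL) {
        // ... use buf ...
        ggml_aligned_free(buf, sz);               // size is required for the vm_deallocate path
    }
}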
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); } return result; @@ -455,7 +518,16 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; -GGML_CALL const char * ggml_status_to_string(enum ggml_status status) { +#if defined(__ARM_ARCH) +struct ggml_arm_arch_features_type { + int has_neon; + int has_i8mm; + int has_sve; + int sve_cnt; +} ggml_arm_arch_features = {-1, -1, -1, 0}; +#endif + +const char * ggml_status_to_string(enum ggml_status status) { switch (status) { case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; case GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; @@ -686,7 +758,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", .blck_size = 1, @@ -1108,9 +1180,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { +const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { GGML_ASSERT(type < GGML_TYPE_COUNT); - return type_traits[type]; + return &type_traits[type]; } // @@ -2951,6 +3023,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM_ROWS", "MEAN", "ARGMAX", + "COUNT_EQUAL", "REPEAT", "REPEAT_BACK", "CONCAT", @@ -3024,7 +3097,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); +static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3045,6 +3118,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx_k", "Σx/n", "argmax(x)", + "count_equal(x)", "repeat(x)", "repeat_back(x)", "concat(x, y)", @@ -3118,7 +3192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); +static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3341,7 +3415,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { if (fptr != NULL) { char buf[42]; if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); } fclose(fptr); } @@ -3359,36 +3433,36 @@ bool ggml_is_numa(void) { //////////////////////////////////////////////////////////////////////////////// void ggml_print_object(const struct ggml_object * obj) { - GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", obj->type, obj->offs, obj->size, (const void *) 
obj->next); } void ggml_print_objects(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; - GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx); while (obj != NULL) { ggml_print_object(obj); obj = obj->next; } - GGML_PRINT("%s: --- end ---\n", __func__); + GGML_LOG_INFO("%s: --- end ---\n", __func__); } -GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) { +int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) { +int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { +size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -3411,15 +3485,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -GGML_CALL int64_t ggml_blck_size(enum ggml_type type) { +int64_t ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -GGML_CALL size_t ggml_type_size(enum ggml_type type) { +size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) { +size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } @@ -3428,15 +3502,15 @@ double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -GGML_CALL const char * ggml_type_name(enum ggml_type type) { +const char * ggml_type_name(enum ggml_type type) { return type < GGML_TYPE_COUNT ? 
type_traits[type].type_name : "NONE"; } -GGML_CALL bool ggml_is_quantized(enum ggml_type type) { +bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -GGML_CALL const char * ggml_op_name(enum ggml_op op) { +const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } @@ -3448,7 +3522,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } -GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { +const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); @@ -3456,7 +3530,7 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { return ggml_op_name(t->op); } -GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) { +size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } @@ -3549,7 +3623,7 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -3575,23 +3649,23 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { return true; } -GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } -GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 0); } -GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 1); } -GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 2); } -GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) { +bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; @@ -3606,7 +3680,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) { +bool ggml_is_empty(const struct ggml_tensor * tensor) { for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] == 0) { // empty if any dimension has no elements @@ -3673,6 +3747,70 @@ static inline int ggml_up(int n, int m) { //////////////////////////////////////////////////////////////////////////////// +#if defined(__ARM_ARCH) + +#if defined(__linux__) && defined(__aarch64__) +#include +#elif defined(__APPLE__) +#include +#endif + +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM 0 +#endif + +static void ggml_init_arm_arch_features(void) { +#if defined(__linux__) && defined(__aarch64__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); + +#if defined(__ARM_FEATURE_SVE) + ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & 
prctl(PR_SVE_GET_VL); +#endif +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_neon = oldp; + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_i8mm = oldp; + + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; +#else +// Run-time CPU feature detection not implemented for this platform, fallback to compile time +#if defined(__ARM_NEON) + ggml_arm_arch_features.has_neon = 1; +#else + ggml_arm_arch_features.has_neon = 0; +#endif + +#if defined(__ARM_FEATURE_MATMUL_INT8) + ggml_arm_arch_features.has_i8mm = 1; +#else + ggml_arm_arch_features.has_i8mm = 0; +#endif + +#if defined(__ARM_FEATURE_SVE) + ggml_arm_arch_features.has_sve = 1; + ggml_arm_arch_features.sve_cnt = 16; +#else + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; +#endif +#endif +} +#endif + struct ggml_context * ggml_init(struct ggml_init_params params) { // make this function thread safe ggml_critical_section_start(); @@ -3723,6 +3861,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } +#if defined(__ARM_ARCH) + ggml_init_arm_arch_features(); +#endif + is_first_call = false; } @@ -3756,7 +3898,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? 
false : true, /*.no_alloc =*/ params.no_alloc, /*.no_alloc_save =*/ params.no_alloc, @@ -3771,12 +3913,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT_ALIGNED(ctx->mem_buffer); -#if defined(__ARM_FEATURE_SVE) - if (!ggml_sve_cnt_b) { - ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); - } -#endif - GGML_PRINT_DEBUG("%s: context initialized\n", __func__); ggml_critical_section_end(); @@ -3802,7 +3938,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { - GGML_ALIGNED_FREE(ctx->mem_buffer); + ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); } found = true; @@ -3894,7 +4030,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; @@ -3958,7 +4094,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( if (ctx->scratch.data != NULL) { // allocate tensor data in the scratch buffer if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", __func__, ctx->scratch.offs + data_size, ctx->scratch.size); assert(false); return NULL; @@ -4127,9 +4263,13 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa } struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + if (ggml_is_empty(tensor)) { + return tensor; + } if (tensor->buffer) { ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); } else { + GGML_ASSERT(tensor->data); memset(tensor->data, 0, ggml_nbytes(tensor)); } return tensor; @@ -4560,7 +4700,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } -GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { +enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } @@ -4657,18 +4797,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam static struct ggml_tensor * ggml_dup_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_DUP; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DUP; result->src[0] = a; return result; @@ -4676,13 +4809,13 @@ static struct ggml_tensor * ggml_dup_impl( struct ggml_tensor * ggml_dup( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { return ggml_dup_impl(ctx, a, false); } struct ggml_tensor * ggml_dup_inplace( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { return ggml_dup_impl(ctx, a, true); } @@ -4690,21 +4823,14 @@ struct ggml_tensor * ggml_dup_inplace( static struct ggml_tensor * ggml_add_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD; result->src[0] = a; result->src[1] = b; @@ -4713,15 +4839,15 @@ static struct ggml_tensor * ggml_add_impl( struct ggml_tensor * ggml_add( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add_impl(ctx, a, b, false); } struct ggml_tensor * ggml_add_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add_impl(ctx, a, b, true); } @@ -4729,9 +4855,9 @@ struct ggml_tensor * ggml_add_inplace( static struct ggml_tensor * ggml_add_cast_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - enum ggml_type type) { + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); @@ -4741,18 +4867,9 @@ static struct ggml_tensor * ggml_add_cast_impl( a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16); - bool is_node = false; - - if (a->grad || b->grad) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); - result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL; + result->op = GGML_OP_ADD; result->src[0] = a; result->src[1] = b; @@ -4761,9 +4878,9 @@ static struct ggml_tensor * ggml_add_cast_impl( struct ggml_tensor * ggml_add_cast( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - enum ggml_type type) { + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { return ggml_add_cast_impl(ctx, a, b, type); } @@ -4771,22 +4888,15 @@ struct ggml_tensor * ggml_add_cast( static struct ggml_tensor * ggml_add1_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_is_scalar(b)); GGML_ASSERT(ggml_is_padded_1d(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_ADD1; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD1; result->src[0] = a; result->src[1] = b; @@ -4795,15 +4905,15 @@ static struct ggml_tensor * ggml_add1_impl( struct ggml_tensor * ggml_add1( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add1_impl(ctx, a, b, false); } struct ggml_tensor * ggml_add1_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add1_impl(ctx, a, b, true); } @@ -4811,31 +4921,24 @@ struct ggml_tensor * ggml_add1_inplace( static struct ggml_tensor * ggml_acc_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(a->type == GGML_TYPE_F32); GGML_ASSERT(b->type == GGML_TYPE_F32); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ACC; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ACC; result->src[0] = a; result->src[1] = b; @@ -4844,23 +4947,23 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * ggml_acc( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } struct ggml_tensor * ggml_acc_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } @@ -4868,23 +4971,14 @@ struct ggml_tensor * ggml_acc_inplace( static struct ggml_tensor * ggml_sub_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SUB; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUB; result->src[0] = a; result->src[1] = b; @@ -4893,15 +4987,15 @@ static struct ggml_tensor * ggml_sub_impl( struct ggml_tensor * ggml_sub( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_sub_impl(ctx, a, b, false); } struct ggml_tensor * ggml_sub_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_sub_impl(ctx, a, b, true); } @@ -4909,27 +5003,14 @@ struct ggml_tensor * ggml_sub_inplace( static struct ggml_tensor * ggml_mul_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - - if (inplace) { - GGML_ASSERT(!is_node); - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_MUL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL; result->src[0] = a; result->src[1] = b; @@ -4954,25 +5035,14 @@ struct ggml_tensor * ggml_mul_inplace( static struct ggml_tensor * ggml_div_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - if (inplace) { - GGML_ASSERT(!is_node); - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_DIV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIV; result->src[0] = a; result->src[1] = b; @@ -4997,18 +5067,11 @@ struct ggml_tensor * ggml_div_inplace( static struct ggml_tensor * ggml_sqr_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SQR; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SQR; result->src[0] = a; return result; @@ -5030,18 +5093,11 @@ struct ggml_tensor * ggml_sqr_inplace( static struct ggml_tensor * ggml_sqrt_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SQRT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SQRT; result->src[0] = a; return result; @@ -5064,17 +5120,10 @@ struct ggml_tensor * ggml_sqrt_inplace( static struct ggml_tensor * ggml_log_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_LOG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_LOG; result->src[0] = a; return result; @@ -5097,17 +5146,10 @@ struct ggml_tensor * ggml_log_inplace( static struct ggml_tensor * ggml_sin_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SIN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SIN; result->src[0] = a; return result; @@ -5130,17 +5172,10 @@ struct ggml_tensor * ggml_sin_inplace( static struct ggml_tensor * ggml_cos_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_COS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_COS; result->src[0] = a; return result; @@ -5162,17 +5197,10 @@ struct ggml_tensor * ggml_cos_inplace( struct ggml_tensor * ggml_sum( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - + struct ggml_tensor * a) { struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = GGML_OP_SUM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUM; result->src[0] = a; return result; @@ -5182,13 +5210,7 @@ struct ggml_tensor * ggml_sum( struct ggml_tensor * ggml_sum_rows( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - + struct ggml_tensor * a) { int64_t ne[GGML_MAX_DIMS] = { 1 }; for (int i = 1; i < GGML_MAX_DIMS; ++i) { ne[i] = a->ne[i]; @@ -5196,8 +5218,7 @@ struct ggml_tensor * ggml_sum_rows( struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); - result->op = GGML_OP_SUM_ROWS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUM_ROWS; result->src[0] = a; return result; @@ -5207,19 +5228,11 @@ struct ggml_tensor * ggml_sum_rows( struct ggml_tensor * ggml_mean( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - + struct ggml_tensor * a) { int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MEAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MEAN; result->src[0] = a; return result; @@ -5229,42 +5242,45 @@ struct ggml_tensor * ggml_mean( struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); - is_node = true; - } struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); - result->op = GGML_OP_ARGMAX; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ARGMAX; result->src[0] = a; return result; } +// ggml_count_equal + +struct ggml_tensor * ggml_count_equal( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1); + + result->op = GGML_OP_COUNT_EQUAL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_repeat struct ggml_tensor * ggml_repeat( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_can_repeat(a, b)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); - result->op = GGML_OP_REPEAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_REPEAT; result->src[0] = a; return result; @@ -5274,24 +5290,13 @@ struct ggml_tensor * ggml_repeat( struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - if (ggml_are_same_shape(a, b) && !is_node) { - return a; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); - result->op = GGML_OP_REPEAT_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_REPEAT_BACK; result->src[0] = a; return result; @@ -5301,9 +5306,9 @@ struct ggml_tensor * ggml_repeat_back( struct ggml_tensor * ggml_concat( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int dim) { + struct ggml_tensor * a, + struct ggml_tensor * b, + int dim) { GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS); int64_t ne[GGML_MAX_DIMS]; @@ -5316,19 +5321,11 @@ struct ggml_tensor * ggml_concat( ne[d] = a->ne[d]; } - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); ggml_set_op_params_i32(result, 0, dim); - result->op = GGML_OP_CONCAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONCAT; result->src[0] = a; result->src[1] = b; @@ -5437,20 +5434,14 @@ struct ggml_tensor * ggml_relu_inplace( struct ggml_tensor * ggml_leaky_relu( struct ggml_context * ctx, - struct ggml_tensor * a, float negative_slope, bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: not implemented - is_node = true; - } - + struct ggml_tensor * a, + float negative_slope, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &negative_slope, sizeof(negative_slope)); - result->op = GGML_OP_LEAKY_RELU; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_LEAKY_RELU; result->src[0] = a; return result; @@ -5518,17 +5509,9 @@ struct ggml_tensor * ggml_silu_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - bool is_node = false; - - if (a->grad || b->grad) { - // TODO: implement backward - is_node = true; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SILU_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SILU_BACK; result->src[0] = a; result->src[1] = b; @@ -5536,6 +5519,7 @@ struct ggml_tensor * ggml_silu_back( } // ggml hardswish + struct ggml_tensor * ggml_hardswish( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5543,6 +5527,7 @@ struct ggml_tensor * ggml_hardswish( } // ggml hardsigmoid + struct ggml_tensor * ggml_hardsigmoid( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5550,6 +5535,7 @@ struct ggml_tensor * ggml_hardsigmoid( } // ggml exp + struct ggml_tensor * ggml_exp( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5567,21 +5553,13 @@ struct ggml_tensor * ggml_exp_inplace( static struct ggml_tensor * ggml_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, - float eps, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_NORM; result->src[0] = a; return result; @@ -5590,14 +5568,14 @@ static struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * ggml_norm( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_norm_impl(ctx, a, eps, false); } struct ggml_tensor * ggml_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_norm_impl(ctx, a, eps, true); } @@ -5606,20 +5584,13 @@ struct ggml_tensor * ggml_norm_inplace( static struct ggml_tensor * ggml_rms_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, - float eps, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_RMS_NORM; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RMS_NORM; result->src[0] = a; return result; @@ -5628,14 +5599,14 @@ static struct ggml_tensor * ggml_rms_norm_impl( struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_rms_norm_impl(ctx, a, eps, false); } struct ggml_tensor * ggml_rms_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_rms_norm_impl(ctx, a, eps, true); } @@ -5645,20 +5616,12 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - float eps) { - bool is_node = false; - - if (a->grad) { - // TODO: implement backward - is_node = true; - } - + float eps) { struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_RMS_NORM_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RMS_NORM_BACK; result->src[0] = a; result->src[1] = b; @@ -5668,43 +5631,35 @@ struct ggml_tensor * ggml_rms_norm_back( // ggml_group_norm static struct ggml_tensor * ggml_group_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps, - bool inplace) { - - bool is_node = false; - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, n_groups); ggml_set_op_params_f32(result, 1, eps); - result->op = GGML_OP_GROUP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GROUP_NORM; result->src[0] = a; return result; } struct ggml_tensor * ggml_group_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps) { return ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct ggml_tensor * ggml_group_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps) { return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } @@ -5717,17 +5672,10 @@ struct ggml_tensor * ggml_mul_mat( GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MUL_MAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL_MAT; result->src[0] = a; result->src[1] = b; @@ -5773,17 +5721,10 @@ struct ggml_tensor * ggml_mul_mat_id( GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast - bool is_node = false; - - if (as->grad || b->grad) { - is_node = true; - } - const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MUL_MAT_ID; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL_MAT_ID; result->src[0] = as; result->src[1] = b; result->src[2] = ids; @@ -5800,18 +5741,11 @@ struct ggml_tensor * ggml_out_prod( GGML_ASSERT(ggml_can_out_prod(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_OUT_PROD; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_OUT_PROD; result->src[0] = a; result->src[1] = b; @@ -5824,21 +5758,14 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_context * ctx, struct ggml_tensor * a, float s, - bool inplace) { + bool inplace) { GGML_ASSERT(ggml_is_padded_1d(a)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &s, sizeof(s)); - result->op = GGML_OP_SCALE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SCALE; result->src[0] = a; return result; @@ -5846,15 +5773,15 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_tensor * ggml_scale( struct ggml_context * ctx, - struct ggml_tensor * a, - float s) { + struct ggml_tensor * a, + float s) { return ggml_scale_impl(ctx, a, s, false); } struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - float s) { + struct ggml_tensor * a, + float s) { return ggml_scale_impl(ctx, a, s, true); } @@ -5868,15 +5795,9 @@ static struct ggml_tensor * ggml_set_impl( size_t nb2, size_t nb3, size_t offset, - bool inplace) { + bool inplace) { GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5884,8 +5805,7 @@ static struct ggml_tensor * ggml_set_impl( int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_SET; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SET; result->src[0] = a; result->src[1] = b; @@ -5894,8 +5814,8 @@ static struct ggml_tensor * ggml_set_impl( struct ggml_tensor * ggml_set( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, @@ -5905,8 +5825,8 @@ struct ggml_tensor * ggml_set( struct ggml_tensor * ggml_set_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, @@ -5916,24 +5836,24 @@ struct ggml_tensor * ggml_set_inplace( struct ggml_tensor * ggml_set_1d( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); } struct ggml_tensor * ggml_set_1d_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); } struct ggml_tensor * ggml_set_2d( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); @@ -5941,8 +5861,8 @@ struct ggml_tensor * ggml_set_2d( struct ggml_tensor * ggml_set_2d_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); @@ -5956,13 +5876,6 @@ static struct ggml_tensor * ggml_cpy_impl( struct ggml_tensor * b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - bool is_node = false; - - if (a->grad || b->grad) { - // inplace is false and either one have a grad - is_node = true; - } - // make a view of the destination struct ggml_tensor * result = ggml_view_tensor(ctx, b); if (strlen(b->name) > 0) { @@ -5971,8 +5884,7 @@ static struct ggml_tensor * ggml_cpy_impl( ggml_format_name(result, "%s (copy)", a->name); } - result->op = GGML_OP_CPY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CPY; result->src[0] = a; result->src[1] = b; @@ -5990,13 +5902,10 @@ struct ggml_tensor * ggml_cast( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_type type) { - bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); ggml_format_name(result, "%s (copy)", a->name); - result->op = GGML_OP_CPY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CPY; result->src[0] = a; result->src[1] = result; @@ -6008,17 +5917,10 @@ struct ggml_tensor * ggml_cast( static struct ggml_tensor * ggml_cont_impl( struct ggml_context * ctx, struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_format_name(result, "%s (cont)", a->name); - result->op = GGML_OP_CONT; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONT; result->src[0] = a; return result; @@ -6064,13 +5966,10 @@ struct ggml_tensor * ggml_cont_4d( int64_t ne3) { GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3)); - bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); ggml_format_name(result, "%s (cont)", a->name); - result->op = GGML_OP_CONT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONT; result->src[0] = a; return result; @@ -6086,22 +5985,10 @@ struct ggml_tensor * ggml_reshape( // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - if (b->grad) { - // gradient propagation is not supported - //GGML_ABORT("fatal error"); - } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6114,18 +6001,11 @@ struct ggml_tensor * ggml_reshape_1d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[1] = { ne0 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6139,18 +6019,11 @@ struct ggml_tensor * ggml_reshape_2d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6165,18 +6038,11 @@ struct ggml_tensor * ggml_reshape_3d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6192,18 +6058,11 @@ struct ggml_tensor * ggml_reshape_4d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6215,20 +6074,12 @@ static struct ggml_tensor * ggml_view_impl( int n_dims, const int64_t * ne, size_t offset) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_VIEW; result->src[0] = a; return result; @@ -6241,7 +6092,6 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * a, int64_t ne0, size_t offset) { - struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); return result; @@ -6256,7 +6106,6 @@ struct ggml_tensor * ggml_view_2d( int64_t ne1, size_t nb1, size_t offset) { - const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); @@ -6279,7 +6128,6 @@ struct ggml_tensor * ggml_view_3d( size_t nb1, size_t nb2, size_t offset) { - const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); @@ -6304,7 +6152,6 @@ struct ggml_tensor * ggml_view_4d( size_t nb2, size_t nb3, size_t offset) { - const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); @@ -6337,12 +6184,6 @@ struct ggml_tensor * ggml_permute( GGML_ASSERT(axis1 != axis3); GGML_ASSERT(axis2 != axis3); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); ggml_format_name(result, "%s (permuted)", a->name); @@ -6369,8 +6210,7 @@ struct ggml_tensor * ggml_permute( result->nb[2] = nb[2]; result->nb[3] = nb[3]; - result->op = GGML_OP_PERMUTE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_PERMUTE; result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; @@ -6384,12 +6224,6 @@ struct ggml_tensor * ggml_permute( struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); ggml_format_name(result, "%s (transposed)", a->name); @@ -6399,8 +6233,7 @@ struct ggml_tensor * ggml_transpose( result->nb[0] = a->nb[1]; result->nb[1] = a->nb[0]; - result->op = GGML_OP_TRANSPOSE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_TRANSPOSE; result->src[0] = a; return result; @@ -6416,12 +6249,6 @@ struct ggml_tensor * ggml_get_rows( GGML_ASSERT(b->ne[3] == 1); GGML_ASSERT(b->type == GGML_TYPE_I32); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // TODO: implement non F32 return enum ggml_type type = GGML_TYPE_F32; if (a->type == GGML_TYPE_I32) { @@ -6429,8 +6256,7 @@ struct ggml_tensor * ggml_get_rows( } struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); - result->op = GGML_OP_GET_ROWS; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_ROWS; result->src[0] = a; result->src[1] = b; @@ -6447,18 +6273,11 @@ struct ggml_tensor * ggml_get_rows_back( GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // TODO: implement non F32 return //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); - result->op = GGML_OP_GET_ROWS_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_ROWS_BACK; result->src[0] = a; result->src[1] = b; @@ -6471,17 +6290,11 @@ struct ggml_tensor * ggml_diag( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(a->ne[1] == 1); - bool is_node = false; - - if (a->grad) { - is_node = true; - } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne); - result->op = GGML_OP_DIAG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG; result->src[0] = a; return result; @@ -6494,19 +6307,12 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * a, int n_past, bool inplace) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_DIAG_MASK_INF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG_MASK_INF; result->src[0] = a; return result; @@ -6533,19 +6339,12 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * a, int n_past, bool inplace) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_DIAG_MASK_ZERO; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG_MASK_ZERO; result->src[0] = a; return result; @@ -6588,19 +6387,12 @@ static struct ggml_tensor * ggml_soft_max_impl( GGML_ASSERT(mask); } - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); float params[] = { scale, max_bias }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_SOFT_MAX; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SOFT_MAX; result->src[0] = a; result->src[1] = mask; @@ -6635,16 +6427,9 @@ static struct ggml_tensor * ggml_soft_max_back_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; // TODO : implement backward pass - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SOFT_MAX_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SOFT_MAX_BACK; result->src[0] = a; result->src[1] = b; @@ -6693,12 +6478,6 @@ static struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(c->ne[0] >= n_dims / 2); } - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; @@ -6710,8 +6489,7 @@ static struct ggml_tensor * ggml_rope_impl( memcpy(params + 10, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ROPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ROPE; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -6839,13 +6617,6 @@ struct ggml_tensor * ggml_rope_back( GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false && "backwards pass not implemented"); - is_node = false; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; @@ -6857,8 +6628,7 @@ struct ggml_tensor * ggml_rope_back( memcpy(params + 10, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ROPE_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ROPE_BACK; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -6873,21 +6643,13 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * a, float min, float max) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // TODO: when implement backward, fix this: struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CLAMP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CLAMP; result->src[0] = a; return result; @@ -6949,13 +6711,6 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( GGML_ASSERT(p0 == 0); GGML_ASSERT(d0 == 1); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), a->ne[1], b->ne[2], 1, @@ -6965,8 +6720,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONV_TRANSPOSE_1D; result->src[0] = a; result->src[1] = b; @@ -6974,17 +6728,17 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( } // ggml_conv_depthwise -struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { +struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), @@ -7004,29 +6758,23 @@ struct ggml_tensor * ggml_conv_depthwise_2d( // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] struct ggml_tensor * ggml_im2col( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D, - enum ggml_type dst_type) { - + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D, + enum ggml_type dst_type) { if(is_2D) { GGML_ASSERT(a->ne[2] == b->ne[2]); } else { GGML_ASSERT(a->ne[1] == b->ne[1]); GGML_ASSERT(b->ne[3] == 1); } - bool is_node = false; - - if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data - is_node = true; - } const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); @@ -7045,8 +6793,7 @@ struct ggml_tensor * ggml_im2col( int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_IM2COL; result->src[0] = a; result->src[1] = b; @@ -7054,30 +6801,22 @@ struct ggml_tensor * ggml_im2col( } struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t * ne, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D) { - - bool is_node = false; - - if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t * ne, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_IM2COL_BACK; result->src[0] = a; result->src[1] = b; @@ -7091,12 +6830,12 @@ struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW] struct ggml_tensor * result = @@ -7112,6 +6851,7 @@ struct ggml_tensor * ggml_conv_2d( } // ggml_conv_2d_sk_p0 + struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -7141,13 +6881,6 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( int stride) { GGML_ASSERT(a->ne[3] == b->ne[2]); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), @@ -7158,8 +6891,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( ggml_set_op_params_i32(result, 0, stride); - result->op = GGML_OP_CONV_TRANSPOSE_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONV_TRANSPOSE_2D; result->src[0] = a; result->src[1] = b; @@ -7181,14 +6913,6 @@ struct ggml_tensor * ggml_pool_1d( int k0, int s0, int p0) { - - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], @@ -7200,8 +6924,7 @@ struct ggml_tensor * ggml_pool_1d( int32_t params[] = { op, k0, s0, p0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_1D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_1D; result->src[0] = a; return result; @@ -7219,13 +6942,6 @@ struct ggml_tensor * ggml_pool_2d( int s1, float p0, float p1) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result; const int64_t ne[4] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), @@ -7238,9 +6954,9 @@ struct ggml_tensor * ggml_pool_2d( int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_2D; result->src[0] = a; + return result; } @@ -7255,100 +6971,74 @@ struct ggml_tensor * ggml_pool_2d_back( int s1, float p0, float p1) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result; result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_2D_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_2D_BACK; result->src[0] = a; result->src[1] = af; + return result; } // ggml_upscale static struct ggml_tensor * ggml_upscale_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3) { GGML_ASSERT(a->ne[0] <= ne0); GGML_ASSERT(a->ne[1] <= ne1); GGML_ASSERT(a->ne[2] <= ne2); GGML_ASSERT(a->ne[3] <= ne3); - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - ne0, - ne1, - ne2, - ne3 - ); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); - result->op = GGML_OP_UPSCALE; - - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_UPSCALE; result->src[0] = a; return result; } struct ggml_tensor * ggml_upscale( - struct ggml_context * ctx, - struct ggml_tensor * a, - int scale_factor) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]); } struct ggml_tensor * ggml_upscale_ext( - struct ggml_context * ctx, - struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3) { return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3); } // ggml_pad struct ggml_tensor * ggml_pad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3) { struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0] + p0, a->ne[1] + p1, a->ne[2] + p2, a->ne[3] + p3); - result->op = GGML_OP_PAD; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_PAD; result->src[0] = a; return result; @@ -7357,39 +7047,32 @@ struct ggml_tensor * ggml_pad( // ggml_arange struct ggml_tensor * ggml_arange( - struct ggml_context * ctx, - float start, - float stop, - float step) { - + struct ggml_context * ctx, + float start, + float stop, + float step) { GGML_ASSERT(stop > start); const int64_t steps = (int64_t) ceilf((stop - start) / step); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps); - result->op = GGML_OP_ARANGE; ggml_set_op_params_f32(result, 0, start); ggml_set_op_params_f32(result, 1, stop); ggml_set_op_params_f32(result, 2, step); + result->op = GGML_OP_ARANGE; + return result; } // ggml_timestep_embedding struct ggml_tensor * ggml_timestep_embedding( - struct ggml_context * ctx, - struct ggml_tensor * timesteps, - int dim, - int max_period) { - bool is_node = false; - - if (timesteps->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period) { int actual_dim = dim; if (dim % 2 != 0) { actual_dim = dim + 1; @@ -7397,11 +7080,10 @@ struct ggml_tensor * ggml_timestep_embedding( struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); - result->op = GGML_OP_TIMESTEP_EMBEDDING; ggml_set_op_params_i32(result, 0, dim); ggml_set_op_params_i32(result, 1, max_period); - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_TIMESTEP_EMBEDDING; result->src[0] = timesteps; return result; @@ -7410,22 +7092,14 @@ struct ggml_tensor * ggml_timestep_embedding( // ggml_argsort struct ggml_tensor * ggml_argsort( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_sort_order order) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: not implemented - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_sort_order order) { struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); - result->op = GGML_OP_ARGSORT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ARGSORT; result->src[0] = a; return result; @@ -7478,10 +7152,6 @@ struct ggml_tensor * ggml_flash_attn_ext( bool is_node = false; - if (q->grad || k->grad || v->grad) { - is_node = true; - } - // permute(0, 2, 1, 3) int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); @@ -7608,17 +7278,9 @@ struct ggml_tensor * ggml_ssm_conv( GGML_ASSERT(sx->ne[1] == d_inner); GGML_ASSERT(n_t >= 0); - bool is_node = false; - - if (sx->grad || c->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s); - result->op = GGML_OP_SSM_CONV; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SSM_CONV; result->src[0] = sx; result->src[1] = c; @@ -7662,18 +7324,10 @@ struct ggml_tensor * ggml_ssm_scan( GGML_ASSERT(B->ne[2] == n_seqs); } - bool is_node = false; - - if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - // concatenated y + ssm_states struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s)); result->op = GGML_OP_SSM_SCAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = s; result->src[1] = x; result->src[2] = dt; @@ -7693,13 +7347,6 @@ struct ggml_tensor * ggml_win_part( GGML_ASSERT(a->ne[3] == 1); GGML_ASSERT(a->type == GGML_TYPE_F32); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // padding const int px = (w - a->ne[1]%w)%w; const int py = (w - a->ne[2]%w)%w; @@ -7714,8 +7361,7 @@ struct ggml_tensor * ggml_win_part( int32_t params[] = { npx, npy, w }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_WIN_PART; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_WIN_PART; result->src[0] = a; return result; @@ -7731,21 +7377,13 @@ struct ggml_tensor * ggml_win_unpart( int w) { GGML_ASSERT(a->type == GGML_TYPE_F32); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_WIN_UNPART; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_WIN_UNPART; result->src[0] = a; return result; @@ -7761,18 +7399,10 @@ struct ggml_tensor * ggml_get_rel_pos( GGML_ASSERT(qh == kh); GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); - result->op = GGML_OP_GET_REL_POS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_REL_POS; result->src[0] = a; return result; @@ -7796,17 +7426,10 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); - bool is_node = false; - - if (!inplace && (a->grad || pw->grad || ph->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); - result->op = GGML_OP_ADD_REL_POS; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD_REL_POS; result->src[0] = a; result->src[1] = pw; result->src[2] = ph; @@ -7834,12 +7457,12 @@ struct ggml_tensor * ggml_add_rel_pos_inplace( struct ggml_tensor * ggml_rwkv_wkv( struct ggml_context * ctx, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * r, - struct ggml_tensor * tf, - struct ggml_tensor * td, - struct ggml_tensor * state) { + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * r, + struct ggml_tensor * tf, + struct ggml_tensor * td, + struct ggml_tensor * state) { GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(r)); @@ -7860,19 +7483,11 @@ struct ggml_tensor * ggml_rwkv_wkv( GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); } - bool is_node = false; - - if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // concat output and new_state const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_RWKV_WKV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RWKV_WKV; result->src[0] = k; result->src[1] = v; result->src[2] = r; @@ -7887,23 +7502,16 @@ struct ggml_tensor * ggml_rwkv_wkv( static struct ggml_tensor * ggml_unary_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_unary_op op, - bool inplace) { + struct ggml_tensor * a, + enum ggml_unary_op op, + bool inplace) { GGML_ASSERT(ggml_is_contiguous_1(a)); - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, (int32_t) op); - result->op = GGML_OP_UNARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_UNARY; result->src[0] = a; return result; @@ -7912,14 +7520,14 @@ static struct ggml_tensor * ggml_unary_impl( struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, - enum ggml_unary_op op) { + enum ggml_unary_op op) { return ggml_unary_impl(ctx, a, op, false); } struct ggml_tensor * ggml_unary_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - enum ggml_unary_op op) { + enum ggml_unary_op op) { return ggml_unary_impl(ctx, a, op, true); } @@ -7928,20 +7536,13 @@ struct ggml_tensor * ggml_unary_inplace( static struct ggml_tensor * ggml_map_unary_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - + const ggml_unary_op_f32_t fun, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_UNARY; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_UNARY; result->src[0] = a; return result; @@ -7950,14 +7551,14 @@ static struct ggml_tensor * ggml_map_unary_impl_f32( struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { + const ggml_unary_op_f32_t fun) { return ggml_map_unary_impl_f32(ctx, a, fun, false); } struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { + const ggml_unary_op_f32_t fun) { return ggml_map_unary_impl_f32(ctx, a, fun, true); } @@ -7967,22 +7568,15 @@ static struct ggml_tensor * ggml_map_binary_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun, - bool inplace) { + const ggml_binary_op_f32_t fun, + bool inplace) { GGML_ASSERT(ggml_are_same_shape(a, b)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_BINARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_BINARY; result->src[0] = a; result->src[1] = b; @@ -7993,7 +7587,7 @@ struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { + const ggml_binary_op_f32_t fun) { return ggml_map_binary_impl_f32(ctx, a, b, fun, false); } @@ -8001,7 +7595,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { + const ggml_binary_op_f32_t fun) { return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } @@ -8011,19 +7605,12 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, const ggml_custom1_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->src[0] = a; return result; @@ -8050,19 +7637,12 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_tensor * a, struct ggml_tensor * b, const ggml_custom2_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->src[0] = a; result->src[1] = b; @@ -8093,19 +7673,12 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_tensor * b, struct ggml_tensor * c, const ggml_custom3_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad || b->grad || c->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8133,26 +7706,20 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( // ggml_map_custom1 struct ggml_map_custom1_op_params { - ggml_custom1_op_t fun; - int n_tasks; - void * userdata; + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; }; static struct ggml_tensor * ggml_map_custom1_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom1_op_params params = { @@ -8162,55 +7729,48 @@ static struct ggml_tensor * ggml_map_custom1_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM1; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM1; result->src[0] = a; return result; } struct ggml_tensor * ggml_map_custom1( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom1_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); } // ggml_map_custom2 struct ggml_map_custom2_op_params { - ggml_custom2_op_t fun; - int n_tasks; - void * userdata; + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; }; static struct ggml_tensor * ggml_map_custom2_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom2_op_params params = { @@ -8220,8 +7780,7 @@ static struct ggml_tensor * ggml_map_custom2_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM2; - result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM2; result->src[0] = a; result->src[1] = b; @@ -8229,22 +7788,22 @@ static struct ggml_tensor * ggml_map_custom2_impl( } struct ggml_tensor * ggml_map_custom2( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom2_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); } @@ -8257,22 +7816,16 @@ struct ggml_map_custom3_op_params { }; static struct ggml_tensor * ggml_map_custom3_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && (a->grad || b->grad || c->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom3_op_params params = { @@ -8282,8 +7835,7 @@ static struct ggml_tensor * ggml_map_custom3_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM3; - result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM3; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8292,44 +7844,38 @@ static struct ggml_tensor * ggml_map_custom3_impl( } struct ggml_tensor * ggml_map_custom3( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom3_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_are_same_shape(a, b)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = GGML_OP_CROSS_ENTROPY_LOSS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->src[0] = a; result->src[1] = b; @@ -8339,17 +7885,16 @@ struct ggml_tensor * ggml_cross_entropy_loss( // ggml_cross_entropy_loss_back struct ggml_tensor * ggml_cross_entropy_loss_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { GGML_ASSERT(ggml_are_same_shape(a, b)); GGML_ASSERT(ggml_is_scalar(c)); struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; - result->grad = NULL; + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8362,12 +7907,14 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( struct ggml_tensor * ggml_opt_step_adamw( struct ggml_context * ctx, struct ggml_tensor * a, + struct ggml_tensor * grad, float alpha, float beta1, float beta2, float eps, float wd) { - GGML_ASSERT(a->grad); + GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM); + GGML_ASSERT(ggml_are_same_shape(a, grad)); GGML_ASSERT(alpha > 0.0f); GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f); GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f); @@ -8376,13 +7923,6 @@ struct ggml_tensor * ggml_opt_step_adamw( struct ggml_tensor * result = ggml_view_tensor(ctx, a); - result->op = GGML_OP_OPT_STEP_ADAMW; - result->grad = NULL; - result->src[0] = a; - result->src[1] = a->grad; - result->src[2] = ggml_dup_tensor(ctx, a->grad); - result->src[3] = ggml_dup_tensor(ctx, a->grad); - const int64_t iter = 1; memcpy(&result->op_params[0], &iter, sizeof(int64_t)); ggml_set_op_params_f32(result, 2, alpha); @@ -8391,26 +7931,17 @@ struct ggml_tensor * ggml_opt_step_adamw( ggml_set_op_params_f32(result, 5, eps); ggml_set_op_params_f32(result, 6, wd); + result->op = 
GGML_OP_OPT_STEP_ADAMW; + result->src[0] = a; + result->src[1] = grad; + result->src[2] = ggml_dup_tensor(ctx, grad); + result->src[3] = ggml_dup_tensor(ctx, grad); + return result; } //////////////////////////////////////////////////////////////////////////////// -void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { - tensor->flags |= GGML_TENSOR_FLAG_PARAM; - - GGML_ASSERT(tensor->grad == NULL); - tensor->grad = ggml_dup_tensor(ctx, tensor); - ggml_format_name(tensor->grad, "%s (grad)", tensor->name); -} - -void ggml_set_loss(struct ggml_tensor * tensor) { - GGML_ASSERT(ggml_is_scalar(tensor)); - GGML_ASSERT(tensor->type == GGML_TYPE_F32); - GGML_ASSERT(tensor->grad); - tensor->flags |= GGML_TENSOR_FLAG_LOSS; -} - // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( @@ -11326,6 +10857,86 @@ static void ggml_compute_forward_argmax( } } +// ggml_compute_forward_count_equal + +static void ggml_compute_forward_count_equal_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_I64); + + const int64_t nr = ggml_nrows(src0); + + const int ith = params->ith; + const int nth = params->nth; + + int64_t * sums = (int64_t *) params->wdata; + int64_t sum_thread = 0; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir / (ne02*ne01); + const int64_t i02 = (ir - i03*ne03) / ne01; + const int64_t i01 = ir - i03*ne03 - i02*ne02; + + const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; + const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; + + for (int64_t i00 = 0; i00 < ne00; ++i00) { + const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); + const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); + + sum_thread += val0 == val1; + } + } + if (ith != 0) { + sums[ith] = sum_thread; + } + ggml_barrier(params->threadpool); + + if (ith != 0) { + return; + } + + for (int ith_other = 1; ith_other < nth; ++ith_other) { + sum_thread += sums[ith_other]; + } + *((int64_t *) dst->data) = sum_thread; +} + +static void ggml_compute_forward_count_equal( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_I32: + { + ggml_compute_forward_count_equal_i32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_repeat static void ggml_compute_forward_repeat_f32( @@ -13289,6 +12900,10 @@ static void ggml_compute_forward_out_prod_f32( GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + const int ith = params->ith; const int nth = params->nth; @@ -14618,7 +14233,7 @@ static void ggml_rope_cache_init( } } -GGML_CALL void ggml_rope_yarn_corr_dims( +void ggml_rope_yarn_corr_dims( int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end 
correction dims @@ -16109,6 +15724,9 @@ static void ggml_compute_forward_flash_attn_ext_f16( ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot; ggml_to_float_t const v_to_float = type_traits[v->type].to_float; + GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type"); + GGML_ASSERT(v_to_float && "fattn: unsupported V-type"); + // loop over n_batch and n_head for (int ir = ir0; ir < ir1; ++ir) { // q indices @@ -17368,41 +16986,40 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); const int ith = params->ith; const int nth = params->nth; - float * sums = (float *) params->wdata; - - // TODO: handle transposed/permuted matrices - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + float * sums = (float *) params->wdata; + float * st = ((float *) params->wdata) + nth + ith*nc; + float sum_thread = 0.0f; GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (ith == 0) { - memset(sums, 0, sizeof(float) * (nth + nth * nc)); - } - ggml_barrier(params->threadpool); - // rows per thread - const int dr = (nr + nth - 1)/nth; + const int64_t dr = (nr + nth - 1)/nth; // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); - float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = ((float *) params->wdata) + nth + ith*nc; + for (int64_t i1 = ir0; i1 < ir1; ++i1) { + const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); + const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(s0[i])); assert(!isnan(s1[i])); @@ -17411,23 +17028,24 @@ static void ggml_compute_forward_cross_entropy_loss_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - ggml_float sum = ggml_vec_log_soft_max_f32(nc, st, s0, max); - assert(sum >= 0.0); + const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max); + assert(sum_softmax >= 0.0); - ggml_vec_add1_f32(nc, st, st, -sum); + ggml_vec_add1_f32(nc, st, st, -sum_softmax); ggml_vec_mul_f32(nc, st, st, s1); - float st_sum = 0.0f; - ggml_vec_sum_f32(nc, &st_sum, st); - sums[ith] += st_sum; + float sum_st = 0.0f; + ggml_vec_sum_f32(nc, &sum_st, st); + sum_thread += sum_st; #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { assert(!isnan(st[i])); assert(!isinf(st[i])); } #endif } + sums[ith] = sum_thread; ggml_barrier(params->threadpool); if (ith == 0) { @@ -17493,7 +17111,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); #ifndef NDEBUG - for (int 
i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(s0[i])); assert(!isnan(s1[i])); @@ -17512,7 +17130,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_scale_f32(nc, ds0, d_by_nr); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -17700,6 +17318,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_argmax(params, tensor); } break; + case GGML_OP_COUNT_EQUAL: + { + ggml_compute_forward_count_equal(params, tensor); + } break; case GGML_OP_REPEAT: { ggml_compute_forward_repeat(params, tensor); @@ -18130,7 +17752,7 @@ void ggml_build_backward_gradient_checkpointing( struct ggml_tensor * * checkpoints, int n_checkpoints) { ggml_graph_cpy(gf, gb_tmp); - ggml_build_backward_expand(ctx, gf, gb_tmp, false, true); + ggml_build_backward_expand(ctx, gf, gb_tmp, false); if (n_checkpoints <= 0) { ggml_graph_cpy(gb_tmp, gb); @@ -18450,6 +18072,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MEAN: case GGML_OP_ARGMAX: + case GGML_OP_COUNT_EQUAL: { GGML_ABORT("fatal error"); // TODO: implement } @@ -18782,7 +18405,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_soft_max_back(ctx, tensor->grad, tensor), zero_table, acc_table); } - + GGML_ASSERT((!src1 || !src1->grad) && "backward pass for softmax mask not implemented"); } break; case GGML_OP_SOFT_MAX_BACK: { @@ -18823,6 +18446,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor beta_slow), zero_table, acc_table); } + GGML_ASSERT((!src2 || !src2->grad) && "gradients for freq factors not implemented"); } break; case GGML_OP_ROPE_BACK: { @@ -18944,6 +18568,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } case GGML_OP_FLASH_ATTN_EXT: { + GGML_ABORT("FA backward pass not adapted after rework"); struct ggml_tensor * flash_grad = NULL; if (src0->grad || src1->grad || tensor->src[2]->grad) { int32_t t = ggml_get_op_params_i32(tensor, 0); @@ -19118,6 +18743,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor tensor->grad), zero_table, acc_table); } + GGML_ASSERT(!src1->grad && "backward pass for labels not implemented"); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { @@ -19168,7 +18794,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->op == GGML_OP_NONE && node->grad == NULL) { + if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) { // reached a leaf node, not part of the gradient graph (e.g. 
a constant) GGML_ASSERT(cgraph->n_leafs < cgraph->size); @@ -19186,9 +18812,6 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } cgraph->nodes[cgraph->n_nodes] = node; - if (cgraph->grads) { - cgraph->grads[cgraph->n_nodes] = node->grad; - } cgraph->n_nodes++; } } @@ -19216,20 +18839,62 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * ggml_build_forward_impl(cgraph, tensor, true); } -void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep) { +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate) { GGML_ASSERT(gf->n_nodes > 0); GGML_ASSERT(gf->grads); - // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph - if (keep) { - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < gf->n_nodes; ++i) { + struct ggml_tensor * node = gf->nodes[i]; - if (node->grad) { - node->grad = ggml_dup_tensor(ctx, node); - gf->grads[i] = node->grad; - } + if (node->type == GGML_TYPE_I32) { + continue; } + + bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; + bool ignore_src[GGML_MAX_SRC] = {false}; + switch (node->op) { + // gradients in node->src[0] for one reason or another have no effect on output gradients + case GGML_OP_IM2COL: // only used for its shape + case GGML_OP_IM2COL_BACK: // same as IM2COL + ignore_src[0] = true; + break; + case GGML_OP_UNARY: { + const enum ggml_unary_op uop = ggml_get_unary_op(node); + // SGN and STEP unary ops are piecewise constant + if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) { + ignore_src[0] = true; + } + } break; + + // gradients in node->src[1] for one reason or another have no effect on output gradients + case GGML_OP_CPY: // gradients in CPY target are irrelevant + case GGML_OP_GET_ROWS: // row indices not differentiable + case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS + case GGML_OP_ROPE: // positions not differentiable + ignore_src[1] = true; + break; + + default: + break; + } + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (!node->src[j] || !node->src[j]->grad || ignore_src[j]) { + continue; + } + GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16); + needs_grad = true; + break; + } + if (!needs_grad) { + continue; + } + + // inplace operations are currently not supported + GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || + node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); + + // create a new tensor with the same type and shape as the node and set it as grad + node->grad = ggml_dup_tensor(ctx, node); } // keep tables of original gradients for replacement/accumulation logic @@ -19291,7 +18956,7 @@ void ggml_build_opt_adamw( if (node->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, alpha, beta1, beta2, eps, wd); + struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, node->grad, alpha, beta1, beta2, eps, wd); ggml_build_forward_expand(gb, opt_step); } } @@ -19588,6 +19253,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_ARGMAX: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT_EQUAL: + { + n_tasks = 
n_threads; + } break; case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_LEAKY_RELU: @@ -19968,9 +19640,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask void ggml_threadpool_free(struct ggml_threadpool* threadpool) { if (!threadpool) return; + const int n_threads = threadpool->n_threads_max; + #ifndef GGML_USE_OPENMP struct ggml_compute_state* workers = threadpool->workers; - const int n_threads = threadpool->n_threads_max; ggml_mutex_lock(&threadpool->mutex); @@ -19990,8 +19663,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) { ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP - GGML_ALIGNED_FREE(threadpool->workers); - GGML_ALIGNED_FREE(threadpool); + const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads; + ggml_aligned_free(threadpool->workers, workers_size); + ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool)); } #ifndef GGML_USE_OPENMP @@ -20086,6 +19760,10 @@ struct ggml_cplan ggml_graph_plan( cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } } break; + case GGML_OP_COUNT_EQUAL: + { + cur = ggml_type_size(node->type)*n_tasks; + } break; case GGML_OP_MUL_MAT: { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; @@ -20419,7 +20097,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_cplan * cplan) { struct ggml_threadpool * threadpool = - GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); + ggml_aligned_malloc(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; threadpool->cplan = cplan; @@ -20440,7 +20118,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( // Allocate and init workers state const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; - struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size); + struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size); memset(workers, 0, workers_size); for (int j = 0; j < tpp->n_threads; j++) { @@ -20529,7 +20207,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } #else if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); + GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); n_threads = threadpool->n_threads_max; } @@ -21068,30 +20746,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } void ggml_graph_print(const struct ggml_cgraph * cgraph) { - GGML_PRINT("=== GRAPH ===\n"); + GGML_LOG_INFO("=== GRAPH ===\n"); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n", + GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? 
"g" : " "); } - GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs); for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", + GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", i, node->ne[0], node->ne[1], ggml_op_name(node->op), ggml_get_name(node)); } - GGML_PRINT("========================================\n"); + GGML_LOG_INFO("========================================\n"); } // check if node is part of the graph @@ -21262,7 +20940,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fclose(fp); - GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); + GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); } //////////////////////////////////////////////////////////////////////////////// @@ -22094,8 +21772,6 @@ enum ggml_opt_result ggml_opt( struct ggml_context * ctx, struct ggml_opt_params params, struct ggml_tensor * f) { - GGML_ASSERT(f->grad && "ggml_set_param called for at least one parent tensor."); - bool free_ctx = false; if (ctx == NULL) { struct ggml_init_params params_ctx = { @@ -22136,7 +21812,7 @@ enum ggml_opt_result ggml_opt_resume( ggml_build_forward_expand(gf, f); struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf); - ggml_build_backward_expand(ctx, gf, gb, false, true); + ggml_build_backward_expand(ctx, gf, gb, false); return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } @@ -22189,6 +21865,17 @@ void ggml_set_output(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; } +void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { + GGML_UNUSED(ctx); // TODO: remove this parameter + tensor->flags |= GGML_TENSOR_FLAG_PARAM; +} + +void ggml_set_loss(struct ggml_tensor * tensor) { + GGML_ASSERT(ggml_is_scalar(tensor)); + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + tensor->flags |= GGML_TENSOR_FLAG_LOSS; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_quantize_init(enum ggml_type type) { @@ -23569,6 +23256,14 @@ int ggml_cpu_has_avx512_bf16(void) { #endif } +int ggml_cpu_has_amx_int8(void) { +#if defined(__AMX_INT8__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; @@ -23578,16 +23273,16 @@ int ggml_cpu_has_fma(void) { } int ggml_cpu_has_neon(void) { -#if defined(__ARM_NEON) - return 1; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_neon; #else return 0; #endif } int ggml_cpu_has_sve(void) { -#if defined(__ARM_FEATURE_SVE) - return 1; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_sve; #else return 0; #endif @@ -23734,11 +23429,23 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - return 1; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_i8mm; #else return 0; #endif } +int ggml_cpu_get_sve_cnt(void) { +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.sve_cnt; +#else + return 0; +#endif +} + +void ggml_log_set(ggml_log_callback log_callback, void * user_data) { + g_logger_state.log_callback = log_callback ? 
log_callback : ggml_log_callback_default; + g_logger_state.log_callback_user_data = user_data; +} //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml/src/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp index e55414b03..d4fa45b1e 100644 --- a/ggml/src/vulkan-shaders/argsort.comp +++ b/ggml/src/vulkan-shaders/argsort.comp @@ -29,20 +29,18 @@ void main() { const int col = int(gl_LocalInvocationID.x); const uint row = gl_WorkGroupID.y; - if (col >= p.ncols_pad) { - return; - } - const uint row_offset = row * p.ncols; // initialize indices - dst_row[col] = col; + if (col < p.ncols_pad) { + dst_row[col] = col; + } barrier(); for (uint k = 2; k <= p.ncols_pad; k *= 2) { for (uint j = k / 2; j > 0; j /= 2) { const uint ixj = col ^ j; - if (ixj > col) { + if (col < p.ncols_pad && ixj > col) { if ((col & k) == 0) { if (dst_row[col] >= p.ncols || (dst_row[ixj] < p.ncols && (p.order == ASC ? diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 560eee916..7ab08b036 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -94,6 +94,7 @@ class Keys: DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + SWIN_NORM = "{arch}.swin_norm" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" @@ -151,6 +152,8 @@ class Keys: MERGES = "tokenizer.ggml.merges" BOS_ID = "tokenizer.ggml.bos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" @@ -167,11 +170,16 @@ class Keys: CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" @@ -236,6 +244,7 @@ class MODEL_ARCH(IntEnum): EXAONE = auto() GRANITE = auto() GRANITE_MOE = auto() + CHAMELEON = auto() class MODEL_TENSOR(IntEnum): @@ -343,6 +352,8 @@ class MODEL_TENSOR(IntEnum): ENC_FFN_DOWN = auto() ENC_FFN_UP = auto() ENC_OUTPUT_NORM = auto() + CLS = auto() # classifier + CLS_OUT = auto() # classifier output projection MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -394,6 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.GRANITE: "granite", MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -501,6 +513,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.CLS: "cls", + MODEL_TENSOR.CLS_OUT: "cls.output", } MODEL_TENSORS: dict[MODEL_ARCH, 
list[MODEL_TENSOR]] = { @@ -610,6 +624,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, ], MODEL_ARCH.NOMIC_BERT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -641,6 +657,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -804,6 +821,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, @@ -882,6 +901,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q_A, MODEL_TENSOR.ATTN_Q_B, @@ -1260,6 +1281,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.CHAMELEON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -1549,6 +1586,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID @@ -1556,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID + +KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID +KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID +KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID +KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID +KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID +KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID + +# deprecated +KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index bd059b45c..0d8d8a0b0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -670,6 +670,9 @@ class GGUFWriter: def add_expert_weights_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: + self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) + def add_rescale_every_n_layers(self, count: int) -> None: self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) @@ -840,15 +843,6 @@ class GGUFWriter: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - def 
add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 4e850726e..f4a787c56 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -87,6 +87,9 @@ class TensorNameMap: "rope.freqs", # llama-pth "rotary_pos_emb.inv_freq", # chatglm ), + + MODEL_TENSOR.ROPE_FACTORS_LONG: (), + MODEL_TENSOR.ROPE_FACTORS_SHORT: (), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { @@ -380,7 +383,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere olmoe + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -389,7 +392,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere olmoe + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm @@ -679,6 +682,15 @@ class TensorNameMap: MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 ), + + MODEL_TENSOR.CLS: ( + "classifier", # jina + "classifier.dense", # roberta + ), + + MODEL_TENSOR.CLS_OUT: ( + "classifier.out_proj", # roberta + ), } # architecture-specific block mappings diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index dc5749913..f2645f921 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -122,8 +122,30 @@ class SpecialVocab: tokenizer = json.load(f) if self.load_merges: merges = tokenizer.get('model', {}).get('merges') - if isinstance(merges, list) and merges and isinstance(merges[0], str): - self.merges = merges + if isinstance(merges, list) and merges: + if isinstance(merges[0], str): + self.merges = merges + elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): + # New format since transformers 4.45 to support spaces in merges + # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # TODO: internally store as the new format instead of converting to old + if any(' ' in s for pair in merges for s in pair): + logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') + self.merges = [ + ' '.join( + [ + # ensure the spaces are properly encoded + ''.join( + chr(ord(c) + 256) if c == ' ' else c + for c in part + ) + for part in pair + ] + ) + for pair in merges + ] + else: + raise ValueError("Unknown tokenizer merges format") added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} diff --git a/include/llama.h b/include/llama.h index d94aeda0a..2b13e8e8d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -102,6 +102,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, 
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, }; enum llama_rope_type { @@ -192,6 +193,7 @@ extern "C" { LLAMA_POOLING_TYPE_MEAN = 1, LLAMA_POOLING_TYPE_CLS = 2, LLAMA_POOLING_TYPE_LAST = 3, + LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph }; enum llama_attention_type { @@ -201,9 +203,9 @@ extern "C" { }; enum llama_split_mode { - LLAMA_SPLIT_MODE_NONE = 0, // single GPU - LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979) @@ -215,6 +217,7 @@ extern "C" { typedef struct llama_token_data_array { // TODO: consider SoA + // NOTE: this pointer can be modified by the samplers llama_token_data * data; size_t size; int64_t selected; // this is the index in the data array (i.e. not the token id) @@ -230,8 +233,11 @@ extern "C" { // - token : the token ids of the input (used when embd is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - pos : the positions of the respective token in the sequence + // (if set to NULL, the token position will be tracked automatically by llama_decode) // - seq_id : the sequence to which the respective token belongs + // (if set to NULL, the sequence ID will be assumed to be 0) // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output + // (if set to NULL, only the logits for last token will be returned) // typedef struct llama_batch { int32_t n_tokens; @@ -242,15 +248,6 @@ extern "C" { int32_t * n_seq_id; llama_seq_id ** seq_id; int8_t * logits; // TODO: rename this to "output" - - // NOTE: helpers for smooth API transition - can be deprecated in the future - // for future-proof code, use the above fields instead and ignore everything below - // - // pos[i] = all_pos_0 + i*all_pos_1 - // - llama_pos all_pos_0; // used if pos == NULL - llama_pos all_pos_1; // used if pos == NULL - llama_seq_id all_seq_id; // used if seq_id == NULL } llama_batch; enum llama_model_kv_override_type { @@ -431,6 +428,7 @@ extern "C" { LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_gpu_offload(void); + LLAMA_API bool llama_supports_rpc (void); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @@ -773,15 +771,15 @@ extern "C" { // Decoding // - // Return batch for single sequence of tokens starting at pos_0 + // Return batch for single sequence of tokens + // The sequence ID will be fixed to 0 + // The position of the tokens will be tracked automatically by llama_decode // // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it // LLAMA_API struct llama_batch llama_batch_get_one( llama_token * tokens, - int32_t n_tokens, - llama_pos pos_0, - llama_seq_id seq_id); + int32_t n_tokens); // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens // Each token can be assigned up to n_seq_max sequence ids @@ -871,7 +869,8 @@ extern "C" { // Get the embeddings for a sequence id // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - // shape: [n_embd] (1-dimensional) + // when 
pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + // otherwise: float[n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); // @@ -893,6 +892,7 @@ extern "C" { // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @@ -901,15 +901,23 @@ extern "C" { LLAMA_API bool llama_add_bos_token(const struct llama_model * model); LLAMA_API bool llama_add_eos_token(const struct llama_model * model); - // Codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle + // infill tokens + DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); + DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); + DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); + + LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); // // Tokenization // + // The API is thread-safe. + // /// @details Convert the provided text into tokens. /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. @@ -1064,12 +1072,13 @@ extern "C" { // available samplers: - LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); - LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. 
- LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); + DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), + "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @@ -1085,11 +1094,16 @@ extern "C" { /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); + + /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @@ -1136,6 +1150,28 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + // + LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); diff --git a/models/ggml-vocab-chameleon.gguf.inp b/models/ggml-vocab-chameleon.gguf.inp new file mode 100644 index 000000000..9baf7d77a --- /dev/null +++ b/models/ggml-vocab-chameleon.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! 
+__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-chameleon.gguf.out b/models/ggml-vocab-chameleon.gguf.out new file mode 100644 index 000000000..7c5413fee --- /dev/null +++ b/models/ggml-vocab-chameleon.gguf.out @@ -0,0 +1,46 @@ + 17245 16604 16403 16604 33583 18355 + 16421 51153 + + 16604 + 16650 + 16650 16604 + 16581 + 16582 + 16582 16582 + 16582 16582 16582 + 16581 16582 + 31596 17394 + 34926 17394 + 31596 18671 + 34926 18671 + 34926 18671 16384 + 31596 16395 17394 16384 + 34926 16395 17394 16384 + 16811 16704 20410 16483 16631 16397 52854 + 16470 16399 16403 16407 16604 16406 35764 38185 51595 22592 26639 + 29479 23955 17012 20103 25527 27670 17408 19005 21473 24774 + 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 21954 16607 21954 16633 21954 16611 29409 16607 21954 16615 + 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 16604 16391 24664 17153 57169 16721 16872 17073 17304 28729 16392 + 31596 + 34926 + 16650 31596 + 16650 34926 + 16696 31596 + 16696 31596 16582 16696 31596 + 16604 16391 + 16582 16604 16412 + 16390 22623 + 31596 16395 16712 16390 16828 16384 17674 16769 16732 23686 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 + 16384 16384 16384 16384 16384 16384 + 16402 + 16402 16402 + 16402 16402 16402 + 16402 16402 16402 16402 + 16402 16402 16402 16402 16402 + 16402 16402 16402 16402 16402 16402 + 16402 16402 16402 16402 16402 16402 16402 + 16402 16402 16402 16402 16402 16402 16402 16402 + 16402 16402 16402 16402 16402 16402 16402 16402 16402 + 16418 19038 16639 16448 24315 33727 16467 + 18765 17981 + 16582 16604 16582 16582 16604 16582 16582 16582 16604 16581 16604 16581 16581 16604 16581 16582 16650 16582 16650 16604 16582 16696 16582 16696 16604 16582 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 20410 16483 16631 18885 16483 16631 16604 16402 16604 16402 16402 16604 16402 16402 16402 16604 16402 16402 16402 16402 
16604 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16402 16604 16402 16397 16402 16604 16402 16397 16397 16402 16604 16402 16397 16397 16397 16402 16604 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 27683 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 16604 16396 16396 16396 16396 16396 16396 16412 16412 16412 16412 16412 16412 16412 27268 23955 17012 20103 25527 27670 17408 19005 21473 24774 16604 16390 16390 16390 16390 16390 16390 16447 16447 16447 16447 16447 16447 16447 16385 16385 16385 16385 16397 16397 16397 16397 16397 16397 16384 16384 16384 16384 16384 16384 16414 16414 16414 16414 16414 16414 16687 16390 16690 16992 16604 16390 61797 16733 16390 16466 16986 16395 16604 16390 17879 16732 17811 16414 16604 16390 16428 16804 17811 16687 16390 16683 17190 16728 16395 16604 16390 16419 16732 16945 16991 25251 16414 17119 16390 38127 16641 16390 16459 16427 diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 1a52ff5e9..131d7c177 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - auto funcs = ggml_internal_get_type_traits(ggml_type); + const auto * funcs = ggml_get_type_traits(ggml_type); Stat simple, ggml; @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); - else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); + if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); + else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 17e9e4482..88e66ea13 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -236,7 +236,7 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0); + const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0); std::vector q40; std::vector q41; @@ -261,9 +261,9 @@ int main(int argc, char** argv) { // Note, we do not include this in the timing as in practical application // we already have the quantized model weights. 
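For context on the pocs/vdot changes in this hunk: ggml_internal_get_type_traits(), which returned the traits struct by value, is replaced by ggml_get_type_traits(), which returns a const pointer, so every member access becomes funcs-> instead of funcs. . A minimal sketch of the new call pattern follows; the helper name dot_q4_0_q8_0 and the ggml_row_size-based buffer sizing are illustrative and not part of this patch, and n is assumed to be a multiple of the Q4_0 block size.

#include <cstdint>
#include <vector>
#include "ggml.h"

// Quantize a float vector with the Q4_0 traits and dot it against the
// quantized form of a second vector, mirroring what vdot.cpp does here.
static float dot_q4_0_q8_0(const std::vector<float> & x, const std::vector<float> & y) {
    const auto * funcs = ggml_get_type_traits(GGML_TYPE_Q4_0);       // const pointer, no copy
    const auto * vdot  = ggml_get_type_traits(funcs->vec_dot_type);  // traits of the expected RHS type

    const int n = (int) x.size();   // assumed to be a multiple of the Q4_0 block size

    std::vector<uint8_t> qx(ggml_row_size(GGML_TYPE_Q4_0,      n));
    std::vector<uint8_t> qy(ggml_row_size(funcs->vec_dot_type, n));

    funcs->from_float(x.data(), qx.data(), n);   // quantize the "weights"
    vdot ->from_float(y.data(), qy.data(), n);   // quantize the "activations"

    float result = 0.0f;
    funcs->vec_dot(n, &result, 0, qx.data(), 0, qy.data(), 0, 1);
    return result;
}
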
if (useQ4_1) { - funcs.from_float(x1.data(), q41.data(), kVecSize); + funcs->from_float(x1.data(), q41.data(), kVecSize); } else { - funcs.from_float(x1.data(), q40.data(), kVecSize); + funcs->from_float(x1.data(), q40.data(), kVecSize); } // Now measure time the dot product needs using the "scalar" version above @@ -282,10 +282,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); - vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); - else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); + const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type); + vdot->from_float(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/pyrightconfig.json b/pyrightconfig.json index 6016f4b6d..9acbbeb78 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -5,7 +5,8 @@ "reportUnusedImport": "warning", "reportDuplicateImport": "error", "reportDeprecated": "warning", - "reportUnnecessaryTypeIgnoreComment": "warning", + "reportUnnecessaryTypeIgnoreComment": "information", + "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum "executionEnvironments": [ { // TODO: make this version override work correctly diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt index 1d07b0952..859204b27 100644 --- a/requirements/requirements-convert_legacy_llama.txt +++ b/requirements/requirements-convert_legacy_llama.txt @@ -1,5 +1,5 @@ numpy~=1.26.4 sentencepiece~=0.2.0 -transformers>=4.40.1,<5.0.0 +transformers>=4.45.1,<5.0.0 gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 91946c514..c6c1e988a 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" ########################################################### # Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 @@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n" pushd "$build_dir" tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) if [ ${#tests[@]} -eq 0 ]; then - abort "No tests avaliable... check your compliation process..." + abort "No tests available... check your compilation process..." 
fi popd > /dev/null || exit 1 @@ -137,7 +137,7 @@ popd > /dev/null || exit 1 # Select test number if [ -z $test_number ]; then - # List out avaliable tests + # List out available tests printf "Which test would you like to debug?\n" id=0 for s in "${tests[@]}" diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index ee21eab37..47cacb432 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -15,7 +15,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [ "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", - "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", + "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n", @@ -25,12 +25,12 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [ ] CLI_ARGS_LLAMA_BENCH = [ - "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers", + "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers", "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose" ] CLI_ARGS_LLAMA_SERVER = [ - "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base", + "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base", "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", "threads", "verbose" diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index f16336594..ffce2aab0 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -122,7 +122,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h - # src/ggml-backend.c -> ggml/src/ggml-backend.c + # src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp # src/ggml-cann/* -> ggml/src/ggml-cann/ # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp # src/ggml-common.h -> ggml/src/ggml-common.h @@ -169,7 +169,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 36eeed0cc..6d31b21b9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -336c10a4c3c8ec99af484b25a0cddd397a09cdb2 +2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 30a62e088..f6ff5e683 100755 --- a/scripts/sync-ggml.sh +++ 
b/scripts/sync-ggml.sh @@ -9,7 +9,7 @@ cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h -cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c +cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 26ce63e2c..094195106 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -63,6 +63,30 @@ static void llama_log_softmax(float * array, size_t size) { } */ +static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) { + if (temp <= 0.0f) { + // find the token with the highest logit and set the rest to -inf + size_t max_i = 0; + float max_l = cur_p->data[0].logit; + + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i ].logit > max_l) { + cur_p->data[max_i].logit = -INFINITY; + max_i = i; + max_l = cur_p->data[i].logit; + } else { + cur_p->data[i].logit = -INFINITY; + } + } + + return; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= temp; + } +} + static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { GGML_ASSERT(cur_p->size > 0); @@ -435,6 +459,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); } @@ -927,9 +954,8 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl* static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_temp *) smpl->ctx; - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= ctx->temp; - } + + llama_sampler_temp_impl(cur_p, ctx->temp); } static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { @@ -977,6 +1003,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke if (ctx->delta > 0) { const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); const float max_temp = ctx->temp + ctx->delta; + float exponent_val = ctx->exponent; // no need to do anything if there is only one (or zero) candidates @@ -1014,9 +1041,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke #endif // Apply the dynamically calculated temperature scaling - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= dyn_temp; - } + llama_sampler_temp_impl(cur_p, dyn_temp); // Re-compute softmax probabilities after scaling logits with dynamic temperature const double max_l_double = cur_p->data[0].logit; @@ -1040,9 +1065,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke } #endif } else { - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= ctx->temp; - } + llama_sampler_temp_impl(cur_p, ctx->temp); } } @@ -1076,6 +1099,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa }; } +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const size_t 
min_keep; + + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f + || ctx->threshold > 0.5f + || cur_p->size < 2) { + return; + } + + std::uniform_real_distribution distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); + if (chance > ctx->probability) return; + + // in case it's not sorted/recalculated yet + llama_sampler_softmax_impl(cur_p); + + int pos_last = 0; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + pos_last = i; + } else break; + } + + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { + cur_p->data += pos_last; + cur_p->size -= pos_last; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ llama_sampler_xtc_reset, + /* .clone = */ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return new llama_sampler { + /* .iface = */ &llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + }, + }; +} + // mirostat struct llama_sampler_mirostat { @@ -1678,6 +1796,229 @@ struct llama_sampler * llama_sampler_init_logit_bias( }; } +// infill + +//#define GGML_DEBUG_SAMPLER_INFILL + +struct llama_sampler_infill { + const struct llama_vocab * vocab; + + std::vector buf0; + std::vector buf1; +}; + +static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { + return "infill"; +} + +static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_infill *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + +#if defined(GGML_DEBUG_SAMPLER_INFILL) +#define LOG_DBG_CUR LLAMA_LOG_DEBUG +#else +#define LOG_DBG_CUR(...) 
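The XTC ("exclude top choices") sampler added above only fires with the configured probability; when it does, it drops every candidate whose probability clears the threshold except the last one that does, provided at least min_keep candidates remain. A self-contained toy version of the same rule, with made-up token ids and probabilities, looks roughly like this:

#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main() {
    const float  probability = 0.5f;
    const float  threshold   = 0.10f;   // must be <= 0.5, as enforced by the sampler
    const size_t min_keep    = 1;

    // candidates already sorted by descending probability (softmax has run)
    std::vector<std::pair<int, float>> cand = {
        {7, 0.40f}, {3, 0.25f}, {9, 0.15f}, {1, 0.12f}, {4, 0.08f},
    };

    std::mt19937 rng(1234);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);

    if (dist(rng) <= probability) {              // the cut is only applied part of the time
        size_t pos_last = 0;
        for (size_t i = 0; i < cand.size(); ++i) {
            if (cand[i].second >= threshold) {
                pos_last = i;                    // last candidate above the threshold
            } else {
                break;
            }
        }
        if (cand.size() - pos_last >= min_keep && pos_last > 0) {
            cand.erase(cand.begin(), cand.begin() + pos_last);   // exclude the top choices
        }
    }

    // note: like the real sampler, this leaves renormalization to later stages
    for (const auto & c : cand) {
        printf("token %d  p=%.2f\n", c.first, c.second);
    }
    return 0;
}
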
+#endif + + for (size_t i = 0; i < cur_p->size; ++i) { + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + float p_txt_sum = 0.0f; + float p_eog_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_eog_sum += cur_p->data[i].p; + } else { + p_txt_sum += cur_p->data[i].p; + } + } + + const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); + + LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (3*p_eog_sum*cur_p->size > p_txt_sum) { + LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); + + // keep just the EOG tokens + const auto size_org = cur_p->size; + + cur_p->size = 0; + + float p_sum = 0.0f; + + for (size_t i = 0; i < size_org; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + } + + return; + } + + size_t n_combined = 0; GGML_UNUSED(n_combined); + + // combine tokens with common prefix + for (size_t i0 = 0; i0 < cur_p->size; ++i0) { + for (size_t i1 = 0; i1 < cur_p->size; ++i1) { + if (cur_p->data[i0].logit == -INFINITY) { + break; + } + + if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) { + continue; + } + + int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); + if (len0 < 0) { + ctx->buf0.resize(len0); + len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); + assert(len0 > 0); + } + + int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); + if (len1 < 0) { + ctx->buf1.resize(len1); + len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); + assert(len1 > 0); + } + + // token i0 is a prefix of token i1 + if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) { + int dst = i0; + int src = i1; + + // merge into the token with higher probability + if (cur_p->data[i1].p > cur_p->data[i0].p) { + std::swap(dst, src); + } + + cur_p->data[dst].p += cur_p->data[src].p; + cur_p->data[src].logit = -INFINITY; + cur_p->data[src].p = 0.0f; + + n_combined++; + } + } + } + + size_t n_non_eog = 0; + + size_t size_org = cur_p->size; + + float p_sum = 0.0f; + float thold = 0.2f; + + cur_p->size = 0; + + LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + if (!is_eog) { + ++n_non_eog; + } + + p_sum += cur_p->data[i].p; + + // keep this token + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); + + // if no non-EOG tokens are left -> reduce cur_p to single EOT token + if (n_non_eog == 0) { + cur_p->size = 1; + cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab); + cur_p->data[0].logit = 1.0f; + + return; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= 
p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + size_org = cur_p->size; + p_sum = 0.0f; + thold = 1.0/(n_non_eog + 1); + + cur_p->size = 0; + + LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + +#undef LOG_DBG_CUR +} + +static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_infill *) smpl->ctx; + return llama_sampler_init_infill_impl(*ctx->vocab); +} + +static void llama_sampler_infill_free(struct llama_sampler * smpl) { + delete (llama_sampler_infill *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_infill_i = { + /* .name = */ llama_sampler_infill_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_infill_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_infill_clone, + /* .free = */ llama_sampler_infill_free, +}; + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab) { + return new llama_sampler { + /* .iface = */ &llama_sampler_infill_i, + /* .ctx = */ new llama_sampler_infill { + /* .vocab = */ &vocab, + /* .buf0 = */ std::vector(512), + /* .buf1 = */ std::vector(512), + }, + }; +} + // utils uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { diff --git a/src/llama-sampling.h b/src/llama-sampling.h index 07f8a66a2..57f100678 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -4,8 +4,6 @@ #include "llama-grammar.h" -#include - struct llama_vocab; struct llama_grammar; @@ -29,3 +27,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl( const char * grammar_root); bool llama_sampler_is_grammar_empty_impl(struct llama_sampler * gsmpl); + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a771eccda..0a49ddbe3 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -50,7 +50,7 @@ struct naive_trie { res.first->second.insert(key + 1, len - 1, value); } } - std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) { + std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { if (len == 0 || offset == len) { return std::make_pair(key, offset); } @@ -79,6 +79,15 @@ struct naive_trie { // impl // +struct llm_tokenizer { + llm_tokenizer() {} + virtual ~llm_tokenizer() = default; +}; + +llama_vocab::~llama_vocab() { + delete tokenizer; +} + int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const { GGML_ASSERT(token_left.find(' ') == std::string::npos); GGML_ASSERT(token_left.find('\n') == std::string::npos); @@ -187,10 +196,15 @@ struct llm_bigram_spm { size_t size; }; -struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} +struct llm_tokenizer_spm : llm_tokenizer { + llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() 
{} +}; + +struct llm_tokenizer_spm_session { + llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { + // split string into utf8 chars int index = 0; size_t offs = 0; @@ -207,7 +221,7 @@ struct llm_tokenizer_spm { } // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { try_add_bigram(i - 1, i); } @@ -271,7 +285,7 @@ private: return; } - resegment(symbols[p->second.first], output); + resegment(symbols[p->second.first], output); resegment(symbols[p->second.second], output); } @@ -279,7 +293,6 @@ private: if (left == -1 || right == -1) { return; } - const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); auto token = vocab.token_to_id.find(text); @@ -306,10 +319,11 @@ private: } const llama_vocab & vocab; + // currently unused + // const llm_tokenizer_spm * spm_tokenizer; std::vector symbols; llm_bigram_spm::queue work_queue; - std::map> rev_merge; }; @@ -352,8 +366,8 @@ struct llm_bigram_bpe { size_t size; }; -struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) { +struct llm_tokenizer_bpe : llm_tokenizer { + llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() { GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE); switch (vocab.type_pre) { case LLAMA_VOCAB_PRE_TYPE_LLAMA3: @@ -450,6 +464,20 @@ struct llm_tokenizer_bpe { "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: + // Note: in theory, the special token (sentinel and image token) regex_exprs below + // are unnecessary, as they are split in `tokenizer_st_partition` anyway. + // However, since the upstream pre-tokenizer uses them, they are also + // included here (see https://huggingface.co/facebook/chameleon-7b). 
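The broader refactor running through these tokenizer hunks splits each tokenizer into an immutable per-vocab object built once by llama_vocab::init_tokenizer() (llm_tokenizer_spm, llm_tokenizer_bpe, ...) and a lightweight per-call session (llm_tokenizer_*_session) that carries only scratch state such as the symbol list and work queue. A rough sketch of that pattern, using hypothetical toy_* names rather than the real classes:

#include <string>
#include <vector>

// Heavy, read-only state (merges, regexes, tries) lives in one object that is
// built once per vocab; every tokenize call then creates a cheap session that
// owns nothing but its own scratch buffers.
struct toy_tokenizer {
    std::vector<std::string> regex_exprs;   // built once, never mutated afterwards
};

struct toy_tokenizer_session {
    explicit toy_tokenizer_session(const toy_tokenizer & tok) : tok(tok) {}

    // trivial whitespace "tokenizer" standing in for the real SPM/BPE/UGM work
    void tokenize(const std::string & text, std::vector<std::string> & output) {
        buf.clear();                         // scratch is per-session, so concurrent
        for (const char c : text) {          // sessions over the same toy_tokenizer
            if (c == ' ') {                  // never race on shared state
                if (!buf.empty()) { output.push_back(buf); buf.clear(); }
            } else {
                buf += c;
            }
        }
        if (!buf.empty()) { output.push_back(buf); }
    }

private:
    const toy_tokenizer & tok;               // shared, read-only
    std::string buf;                         // mutable working state lives here
};
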
+ regex_exprs = { + "", // Sentinel tokens + "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens + "([\\t\\n]| | )", // directly from tokenizer.json + "\\p{N}", // Individual digits + "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -462,7 +490,14 @@ struct llm_tokenizer_bpe { } } - void append(const llama_vocab::id token_id, std::vector & output) const { + std::vector regex_exprs; +}; + +struct llm_tokenizer_bpe_session { + llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab), + bpe_tokenizer(static_cast(vocab.tokenizer)) {} + + static void append(const llama_vocab::id token_id, std::vector & output) { output.push_back(token_id); } @@ -501,12 +536,11 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - - const auto word_collection = unicode_regex_split(text, regex_exprs); + const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs); symbols_final.clear(); - for (auto & word : word_collection) { + for (const auto & word : word_collection) { work_queue = llm_bigram_bpe::queue(); symbols.clear(); @@ -529,7 +563,7 @@ struct llm_tokenizer_bpe { index++; symbols.emplace_back(sym); } - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { add_new_bigram(i - 1, i); } @@ -609,7 +643,6 @@ private: if (left == -1 || right == -1) { return; } - std::string left_token = std::string(symbols[left].text, symbols[left].n); std::string right_token = std::string(symbols[right].text, symbols[right].n); @@ -633,12 +666,10 @@ private: } const llama_vocab & vocab; - - std::vector regex_exprs; + const llm_tokenizer_bpe * bpe_tokenizer; std::vector symbols; std::vector symbols_final; - llm_bigram_bpe::queue work_queue; }; @@ -646,15 +677,17 @@ private: // WPM tokenizer // -struct llm_tokenizer_wpm { - llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} +struct llm_tokenizer_wpm : llm_tokenizer { + llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {} +}; - void tokenize(const std::string & text, std::vector & output) const { +struct llm_tokenizer_wpm_session { + llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} + + void tokenize(const std::string & text, std::vector & output) { const auto & token_map = vocab.token_to_id; - // normalize and split by whitespace std::vector words = preprocess(text); - // bos token prepended already // find the longest tokens that form the words @@ -699,7 +732,7 @@ struct llm_tokenizer_wpm { } // TODO: reduce string copies by using cpts_offs array - std::vector preprocess(const std::string & text) const { + static std::vector preprocess(const std::string & text) { const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); std::vector words(1, ""); @@ -751,15 +784,18 @@ struct llm_tokenizer_wpm { //(cpt >= 0xFF00 && cpt <= 0xFFEF); } +private: const llama_vocab & vocab; + // currently unused + // const llm_tokenizer_wpm * wpm_tokenizer; }; // // UGM tokenizer // -struct llm_tokenizer_ugm { - llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) { +struct llm_tokenizer_ugm : llm_tokenizer { + llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() { if (vocab.precompiled_charsmap.size() > 0) { size_t charsmap_offset = 0; @@ -805,6 +841,30 @@ struct llm_tokenizer_ugm { 
unknown_token_score = min_score - unknown_token_score_penalty; } + // escaped space symbol - U+2581 (Lower One Eighth Block) + const std::string escaped_space = "\xE2\x96\x81"; + + const char * prefix_replacements = NULL; + size_t prefix_replacements_size = 0; + + const uint32_t * xcda_array = NULL; + size_t xcda_array_size = 0; + + struct naive_trie user_defined_token_matcher; + + float min_score = FLT_MAX; + float max_score = -FLT_MAX; + + float unknown_token_score_penalty = 10.0; + float unknown_token_score; + + struct naive_trie token_matcher; +}; + +struct llm_tokenizer_ugm_session { + llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab), + ugm_tokenizer(static_cast(vocab.tokenizer)) {} + /* This implementation is based on SentencePiece optimized Viterbi algorithm for * unigram language models. The general idea is to: * - move along the input sequence in steps of one UTF code point, @@ -843,7 +903,7 @@ struct llm_tokenizer_ugm { // traverse the token matcher trie to find a matching token bool single_codepoint_token_found = false; const struct best_tokenization & current_best = tokenization_results[input_offset]; - const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]); + const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]); while (prefix_offset <= input_len && node != NULL) { // check if we found valid token in prefix @@ -873,7 +933,7 @@ struct llm_tokenizer_ugm { // if we didn't find a valid token corresponding to the whole UTF code point // then use unknown token as the tokenization of this UTF code point if (!single_codepoint_token_found) { - const double challenger_score = current_best.score_sum + unknown_token_score; + const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score; prefix_offset = input_offset + n_utf8_code_units; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { @@ -905,7 +965,6 @@ struct llm_tokenizer_ugm { } private: - const llama_vocab & vocab; // helper structure for returning normalization results struct normalization_result { @@ -918,7 +977,7 @@ private: normalized->clear(); normalized->reserve(input.size() * 3); - const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " "; + const std::string space = vocab.tokenizer_escape_whitespaces ? 
ugm_tokenizer->escaped_space : " "; bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; @@ -1000,13 +1059,21 @@ private: size_t xcda_array_size; }; + // this structure stores the best tokenization so far at input_offset + struct best_tokenization { + llama_token token_id; + size_t input_offset; + float score_sum; + }; + struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { if (input_offset == input.size()) { return { &input[input_offset], 0, 0 }; } // if input prefix matches some user-defined token return this token as normalization result - auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); + auto user_defined_token_match = + ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); if (user_defined_token_match.second > 0) { return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second }; } @@ -1014,8 +1081,8 @@ private: size_t longest_prefix_length = 0; size_t longest_prefix_offset = 0; - if (xcda_array_size > 0) { - struct xcda_array_view xcda_view(xcda_array, xcda_array_size); + if (ugm_tokenizer->xcda_array_size > 0) { + struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size); // Find the longest normalized sequence matching the input prefix by walking // the XOR-compressed compact double array (XCDA) starting from the root node @@ -1051,50 +1118,27 @@ private: if (longest_prefix_length > 0) { // we have a match, so return the replacement sequence - if (longest_prefix_offset >= prefix_replacements_size) { + if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) { throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); } - const char * prefix_replacement = &prefix_replacements[longest_prefix_offset]; + const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset]; return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; - } else { - // check if the input prefix contains a valid sequence of UTF-8 code units - try { - // if yes, return this sequence unmodified - size_t prefix_offset = input_offset; - unicode_cpt_from_utf8(input, prefix_offset); - return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset }; - } catch (std::invalid_argument & /*ex*/) { - // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER - return { "\xEF\xBF\xBD", 3, 1 }; - } + } + + // check if the input prefix contains a valid sequence of UTF-8 code units + try { + // if yes, return this sequence unmodified + size_t prefix_offset = input_offset; + unicode_cpt_from_utf8(input, prefix_offset); + return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset }; + } catch (std::invalid_argument & /*ex*/) { + // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER + return { "\xEF\xBF\xBD", 3, 1 }; } } - // escaped space symbol - U+2581 (Lower One Eighth Block) - const std::string escaped_space = "\xE2\x96\x81"; - - const char * prefix_replacements = NULL; - size_t prefix_replacements_size = 0; - - const uint32_t * xcda_array = NULL; - size_t xcda_array_size = 0; - - struct naive_trie user_defined_token_matcher; - - // this structure stores the 
best tokenization so far at input_offset - struct best_tokenization { - llama_token token_id; - size_t input_offset; - float score_sum; - }; - - float min_score = FLT_MAX; - float max_score = -FLT_MAX; - - float unknown_token_score_penalty = 10.0; - float unknown_token_score; - - struct naive_trie token_matcher; + const llama_vocab & vocab; + const llm_tokenizer_ugm * ugm_tokenizer; }; // @@ -1155,8 +1199,8 @@ static std::vector llama_unescape_rwkv_token(const std::string & escape return output; } -struct llm_tokenizer_rwkv { - llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) { +struct llm_tokenizer_rwkv : llm_tokenizer { + llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() { // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. // For now, we decode the vocab here into the lookup we'll use for tokenization. @@ -1168,11 +1212,17 @@ struct llm_tokenizer_rwkv { } } + struct naive_trie token_matcher; +}; + +struct llm_tokenizer_rwkv_session { + llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab), + rwkv_tokenizer(static_cast(*vocab.tokenizer)) {} + void tokenize(const std::string & text, std::vector & output) { uint32_t position = 0; - while (position < text.size()) { - const struct naive_trie * node = token_matcher.traverse(text[position]); + const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]); if (node == NULL) { // no matching token found, add unknown token output.push_back(vocab.special_unk_id); @@ -1197,11 +1247,33 @@ struct llm_tokenizer_rwkv { } } +private: const llama_vocab & vocab; - - struct naive_trie token_matcher; + const llm_tokenizer_rwkv & rwkv_tokenizer; }; +void llama_vocab::init_tokenizer() { + switch (type) { + case LLAMA_VOCAB_TYPE_SPM: + tokenizer = new llm_tokenizer_spm(*this); + break; + case LLAMA_VOCAB_TYPE_BPE: + tokenizer = new llm_tokenizer_bpe(*this); + break; + case LLAMA_VOCAB_TYPE_WPM: + tokenizer = new llm_tokenizer_wpm(*this); + break; + case LLAMA_VOCAB_TYPE_UGM: + tokenizer = new llm_tokenizer_ugm(*this); + break; + case LLAMA_VOCAB_TYPE_RWKV: + tokenizer = new llm_tokenizer_rwkv(*this); + break; + default: + GGML_ABORT("unsupported vocab type"); + } +} + // // (de-) tokenize // @@ -1263,7 +1335,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // if a fragment is text ( not yet processed ) if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto & raw_text = fragment.raw_text; + const auto & raw_text = fragment.raw_text; auto raw_text_base_offset = fragment.offset; auto raw_text_base_length = fragment.length; @@ -1362,7 +1434,13 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } -std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) { +std::vector llama_tokenize_internal( + const llama_vocab & vocab, + std::string raw_text, + bool add_special, + bool parse_special) { + GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. 
Call llama_vocab::init_tokenizer() first."); + std::vector output; std::forward_list fragment_buffer; @@ -1399,9 +1477,9 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); - tokenizer.tokenize(raw_text, output); + llm_tokenizer_spm_session session(vocab); + session.tokenize(raw_text, output); is_prev_special = false; } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); @@ -1423,10 +1501,11 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab); - + llm_tokenizer_bpe_session session(vocab); + // it calls some other methods that are not exist in llm_tokenizer, + // here just cast it to bpe tokenizer object if (add_special) { - tokenizer.append_bos(output); + session.append_bos(output); } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1435,15 +1514,15 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - tokenizer.append(fragment.token, output); + session.append(fragment.token, output); } } if (add_special) { - tokenizer.append_eos(output); - tokenizer.check_double_bos_eos(output); + session.append_eos(output); + session.check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: @@ -1453,7 +1532,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, output.push_back(vocab.special_cls_id); } - llm_tokenizer_wpm tokenizer(vocab); + llm_tokenizer_wpm_session session(vocab); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1462,7 +1541,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -1475,12 +1554,11 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } break; case LLAMA_VOCAB_TYPE_UGM: { - llm_tokenizer_ugm tokenizer(vocab); - - if (add_special && vocab.tokenizer_add_bos != 0) { + if (add_special && vocab.tokenizer_add_bos) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); } + llm_tokenizer_ugm_session session(vocab); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1488,26 +1566,27 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } } - if (add_special && vocab.tokenizer_add_bos != 0 
&& output.size() >= 2 && output[1] == vocab.special_bos_id) { + if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { LLAMA_LOG_WARN( "%s: Added a BOS token to the prompt as specified by the model but the prompt " "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " "Are you sure this is what you want?\n", __FUNCTION__); } - if (add_special && vocab.tokenizer_add_eos == 1) { + if (add_special && vocab.tokenizer_add_eos) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); } } break; case LLAMA_VOCAB_TYPE_RWKV: { + llm_tokenizer_rwkv_session session(vocab); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -1516,8 +1595,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_rwkv tokenizer(vocab); - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -1585,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { return vocab.special_eos_id; } +llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { + return vocab.special_eot_id; +} + +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { return vocab.special_cls_id; } @@ -1610,33 +1696,49 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { } llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { - return vocab.special_prefix_id; + return vocab.special_fim_pre_id; } llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { - return vocab.special_middle_id; + return vocab.special_fim_mid_id; } llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { - return vocab.special_suffix_id; + return vocab.special_fim_suf_id; } -llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { - return vocab.special_eot_id; +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pre_id; } -llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { - return vocab.special_eom_id; +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_suf_id; +} + +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_mid_id; +} + +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pad_id; +} + +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_rep_id; +} + +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_sep_id; } int32_t llama_tokenize_impl( - const struct llama_vocab & vocab, - const char * text, - int32_t text_len, - llama_token * tokens, - int32_t n_tokens_max, - bool add_special, - bool parse_special) { + const struct llama_vocab & vocab, + const char * text, + int32_t text_len, + llama_token * tokens, + int32_t n_tokens_max, + bool add_special, + bool parse_special) { auto res = llama_tokenize_internal(vocab, std::string(text, 
text_len), add_special, parse_special); if (n_tokens_max < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -1713,11 +1815,13 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token // suppressing them like CONTROL tokens. if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { return _try_copy(token_text.data(), token_text.size()); - } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) { + } + if (attr & LLAMA_TOKEN_ATTR_NORMAL) { std::string result = token_text; llama_unescape_whitespace(result); return _try_copy(result.data(), result.size()); - } else if (attr & LLAMA_TOKEN_ATTR_BYTE) { + } + if (attr & LLAMA_TOKEN_ATTR_BYTE) { char byte = (char) llama_token_to_byte(vocab, token); return _try_copy((char*) &byte, 1); } @@ -1728,7 +1832,8 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token // suppressing them like CONTROL tokens. if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { return _try_copy(token_text.data(), token_text.size()); - } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) { + } + if (attr & LLAMA_TOKEN_ATTR_NORMAL) { std::string result = llama_decode_text(token_text); return _try_copy(result.data(), result.size()); } @@ -1761,6 +1866,8 @@ int32_t llama_detokenize_impl( int32_t text_len_max, bool remove_special, bool unparse_special) { + GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first."); + int32_t avail = text_len_max; int32_t total = 0; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index cc46f642b..d958d0073 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -8,6 +8,8 @@ #include #include +struct llm_tokenizer; + struct llama_vocab { using id = llama_token; using token = std::string; @@ -35,20 +37,26 @@ struct llama_vocab { std::map, int> bpe_ranks; // default LLaMA special tokens + // TODO: should we set all of these to LLAMA_TOKEN_NULL? 
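On the special-token side, the old prefix/suffix/middle ids are renamed to special_fim_pre_id / special_fim_suf_id / special_fim_mid_id (with new pad, rep and sep ids, declared in the llama-vocab.h hunk that follows), and matching llama_token_fim_*_impl accessors are added while the legacy llama_token_prefix/middle/suffix_impl calls now forward to them. Assuming the conventional prefix/suffix/middle infill layout, a hedged sketch of how these accessors might be used:

#include <vector>

#include "llama-vocab.h"

// Sketch only: build a fill-in-the-middle prompt from the accessors declared
// in these hunks. Whether a given model expects exactly this layout is
// model-specific, and make_fim_prompt is not a helper that exists in the tree.
static std::vector<llama_token> make_fim_prompt(
        const llama_vocab & vocab,
        const std::vector<llama_token> & prefix_tokens,
        const std::vector<llama_token> & suffix_tokens) {
    std::vector<llama_token> out;

    const llama_token fim_pre = llama_token_fim_pre_impl(vocab); // was llama_token_prefix_impl
    const llama_token fim_suf = llama_token_fim_suf_impl(vocab); // was llama_token_suffix_impl
    const llama_token fim_mid = llama_token_fim_mid_impl(vocab); // was llama_token_middle_impl

    // the new defaults are LLAMA_TOKEN_NULL, so bail out for models without FIM tokens
    if (fim_pre == LLAMA_TOKEN_NULL || fim_suf == LLAMA_TOKEN_NULL || fim_mid == LLAMA_TOKEN_NULL) {
        return out;
    }

    out.push_back(fim_pre);
    out.insert(out.end(), prefix_tokens.begin(), prefix_tokens.end());
    out.push_back(fim_suf);
    out.insert(out.end(), suffix_tokens.begin(), suffix_tokens.end());
    out.push_back(fim_mid);   // generation continues from here until an EOG token

    return out;
}
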
id special_bos_id = 1; id special_eos_id = 2; + id special_eot_id = LLAMA_TOKEN_NULL; + id special_eom_id = LLAMA_TOKEN_NULL; id special_unk_id = 0; - id special_sep_id = -1; - id special_pad_id = -1; - id special_cls_id = -1; - id special_mask_id = -1; + id special_sep_id = LLAMA_TOKEN_NULL; + id special_pad_id = LLAMA_TOKEN_NULL; + id special_cls_id = LLAMA_TOKEN_NULL; + id special_mask_id = LLAMA_TOKEN_NULL; - id linefeed_id = 13; - id special_prefix_id = -1; - id special_suffix_id = -1; - id special_middle_id = -1; - id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token - id special_eom_id = -1; + id linefeed_id = 13; + + // fim tokens + id special_fim_pre_id = LLAMA_TOKEN_NULL; + id special_fim_suf_id = LLAMA_TOKEN_NULL; + id special_fim_mid_id = LLAMA_TOKEN_NULL; + id special_fim_pad_id = LLAMA_TOKEN_NULL; + id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo + id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator // set of all tokens that cause "end of generation" std::set special_eog_ids; @@ -65,7 +73,14 @@ struct llama_vocab { std::vector precompiled_charsmap; + llm_tokenizer * tokenizer = nullptr; + + llama_vocab() = default; + ~llama_vocab(); + int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; + + void init_tokenizer(); }; // @@ -95,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token llama_token_eos_impl(const struct llama_vocab & vocab); +llama_token llama_token_eot_impl(const struct llama_vocab & vocab); +llama_token llama_token_eom_impl(const struct llama_vocab & vocab); llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -bool llama_add_bos_token_impl(const struct llama_vocab & vocab); -bool llama_add_eos_token_impl(const struct llama_vocab & vocab); - llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); -llama_token llama_token_eot_impl (const struct llama_vocab & vocab); -llama_token llama_token_eom_impl (const struct llama_vocab & vocab); + +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab); + +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, @@ -127,6 +149,12 @@ int32_t llama_token_to_piece_impl( int32_t lstrip, bool special); +// check if token0 is contained as a prefix in token1 +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1); + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama.cpp b/src/llama.cpp index 0fd4f6760..c661c010a 100644 --- 
a/src/llama.cpp +++ b/src/llama.cpp @@ -8,28 +8,16 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_RPC -# include "ggml-rpc.h" -#endif - -#ifdef GGML_USE_CUDA -# include "ggml-cuda.h" -#elif defined(GGML_USE_VULKAN) -# include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -# include "ggml-sycl.h" -#elif defined(GGML_USE_KOMPUTE) +#if defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" -#elif defined(GGML_USE_CANN) -# include "ggml-cann.h" #endif -#ifdef GGML_USE_BLAS -# include "ggml-blas.h" +#ifndef __AMX_INT8__ +#undef GGML_USE_AMX #endif -#ifdef GGML_USE_METAL -# include "ggml-metal.h" +#ifdef GGML_USE_AMX +# include "ggml-amx.h" #endif // TODO: replace with ggml API call @@ -216,6 +204,7 @@ enum llm_arch { LLM_ARCH_RWKV6, LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, + LLM_ARCH_CHAMELEON, LLM_ARCH_UNKNOWN, }; @@ -268,6 +257,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, + { LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -304,6 +294,7 @@ enum llm_kv { LLM_KV_DECODER_START_TOKEN_ID, LLM_KV_ATTN_LOGIT_SOFTCAPPING, LLM_KV_FINAL_LOGIT_SOFTCAPPING, + LLM_KV_SWIN_NORM, LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM, @@ -356,6 +347,8 @@ enum llm_kv { LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, @@ -368,14 +361,20 @@ enum llm_kv { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, - LLM_KV_TOKENIZER_PREFIX_ID, - LLM_KV_TOKENIZER_SUFFIX_ID, - LLM_KV_TOKENIZER_MIDDLE_ID, - LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TOKENIZER_EOM_ID, + LLM_KV_TOKENIZER_FIM_PRE_ID, + LLM_KV_TOKENIZER_FIM_SUF_ID, + LLM_KV_TOKENIZER_FIM_MID_ID, + LLM_KV_TOKENIZER_FIM_PAD_ID, + LLM_KV_TOKENIZER_FIM_REP_ID, + LLM_KV_TOKENIZER_FIM_SEP_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + + // deprecated: + LLM_KV_TOKENIZER_PREFIX_ID, + LLM_KV_TOKENIZER_SUFFIX_ID, + LLM_KV_TOKENIZER_MIDDLE_ID, }; static const std::map LLM_KV_NAMES = { @@ -411,6 +410,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, + { LLM_KV_SWIN_NORM, "%s.swin_norm" }, { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, @@ -432,57 +432,65 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, 
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, - { LLM_KV_SPLIT_NO, "split.no" }, - { LLM_KV_SPLIT_COUNT, "split.count" }, - { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, - { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, - { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, - { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, - { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, - { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, - { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, - { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, 
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + + // deprecated + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, }; struct LLM_KV { @@ -602,9 +610,11 @@ enum llm_tensor { LLM_TENSOR_ENC_FFN_DOWN, LLM_TENSOR_ENC_FFN_UP, LLM_TENSOR_ENC_OUTPUT_NORM, + LLM_TENSOR_CLS, + LLM_TENSOR_CLS_OUT, }; -static const std::map> LLM_TENSOR_NAMES = { +static const std::map> LLM_TENSOR_NAMES = { { LLM_ARCH_LLAMA, { @@ -789,6 +799,8 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_CLS, "cls" }, + { LLM_TENSOR_CLS_OUT, "cls.output" }, }, }, { @@ -824,6 +836,7 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_CLS, "cls" }, }, }, { @@ -1499,6 +1512,25 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_CHAMELEON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1538,32 +1570,32 @@ struct LLM_TN { return LLM_TENSOR_NAMES.at(arch).at(tensor); } - std::string operator()(llm_tensor tensor, const std::string & suffix) const { + std::string 
operator()(llm_tensor tensor, const char * suffix) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix; + return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix; } std::string operator()(llm_tensor tensor, int bid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid); + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid); } - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + std::string operator()(llm_tensor tensor, const char * suffix, int bid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix; } - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { + std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix; } }; @@ -2236,59 +2268,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_ return piece; } -static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { - ggml_backend_buffer_type_t buft = nullptr; - -#if defined(GGML_USE_CUDA) - // host buffers should only be used when data is expected to be copied to/from the GPU - if (host_buffer) { - buft = ggml_backend_cuda_host_buffer_type(); - } -#elif defined(GGML_USE_SYCL) - if (host_buffer) { - buft = ggml_backend_sycl_host_buffer_type(); - } -#elif defined(GGML_USE_CANN) - if (host_buffer) { - buft = ggml_backend_cann_host_buffer_type(); - } -#elif defined(GGML_USE_CPU_HBM) - buft = ggml_backend_cpu_hbm_buffer_type(); -#elif defined(GGML_USE_VULKAN) - if (host_buffer) { - buft = ggml_backend_vk_host_buffer_type(); - } -#endif - - if (buft == nullptr) { - buft = ggml_backend_cpu_buffer_type(); - } - return buft; - - GGML_UNUSED(host_buffer); -} - // // globals // -struct llama_state { - llama_state() { -#ifdef GGML_USE_METAL - ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); -#elif defined(GGML_USE_CUDA) - ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data); -#elif defined(GGML_USE_CANN) - ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data); -#endif - } - - // We save the log callback globally +struct llama_logger_state { ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; -static llama_state g_state; +static llama_logger_state g_logger_state; // available llama models enum e_model { @@ -2362,6 +2351,7 @@ struct llama_hparams { bool vocab_only; bool rope_finetuned; bool use_par_res; + bool swin_norm; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on @@ -2428,7 +2418,7 @@ struct llama_hparams { // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 - llama_token dec_start_token_id = -1; + llama_token dec_start_token_id = LLAMA_TOKEN_NULL; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; @@ -2870,6 +2860,7 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; + // TODO: should init all tensors to nullptr struct ggml_tensor * tok_embd; struct ggml_tensor * type_embd; struct ggml_tensor * pos_embd; @@ -2882,16 +2873,25 @@ struct llama_model { struct ggml_tensor * output_b; struct ggml_tensor * output_norm_enc; + // classifier + struct ggml_tensor * cls; + struct ggml_tensor * cls_b; + struct ggml_tensor * cls_out = nullptr; + struct ggml_tensor * cls_out_b = nullptr; + std::vector layers; + // gguf metadata + std::unordered_map gguf_kv; + llama_split_mode split_mode; int main_gpu; int n_gpu_layers; - std::vector rpc_servers; + // list of devices used in this model + std::vector devices; - // gguf metadata - std::unordered_map gguf_kv; + std::vector rpc_servers; // layer -> buffer type mapping struct layer_buft { @@ -2934,11 +2934,6 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { -#ifdef GGML_USE_CUDA - if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { - ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); - } -#endif ggml_backend_buffer_free(buf); } while (!lora_adapters.empty()) { @@ -2952,9 +2947,6 @@ struct llama_sbatch_seq { llama_seq_id * seq_id; size_t offset; size_t length; - - // helper for smoother batch API transition -- can be deprecated in the future - llama_seq_id all_seq_id; // used if seq_id == NULL }; // sequence-length-aware batch splitting @@ -3049,30 +3041,18 @@ struct llama_sbatch { } else { ubatch.embd = nullptr; } - // from here on, the else branches are deprecated; - // they are helpers for smoother batch API transition - if (batch->pos) { - if (ubatch.equal_seqs) { - for (size_t i = 0; i < length; ++i) { - ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; - } - } else { - // simple split - ubatch.pos = batch->pos + seq.offset; + if (ubatch.equal_seqs) { + for (size_t i = 0; i < length; ++i) { + ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; } } else { - for (size_t i = 0; i < length; ++i) { - llama_pos bi = ids[seq.offset + i]; - ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1); - } + // simple split + ubatch.pos = batch->pos + seq.offset; } if (ubatch.equal_seqs) { ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id; if (seq.seq_id) { ubatch.seq_id[ubatch.n_seqs] = seq.seq_id; - } else { - GGML_ASSERT(seq.n_seq_id == 1); - ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id; } } else { // simple split @@ -3085,10 +3065,6 @@ struct llama_sbatch { } if (batch->seq_id) { ubatch.seq_id = batch->seq_id + seq.offset; - } else { - for (size_t i = 0; i < length; ++i) { - ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id; - } } } if (logits_all) { @@ -3207,7 +3183,6 @@ struct llama_sbatch { s.seq_id = nullptr; s.offset = 0; s.length = n_tokens; - s.all_seq_id = batch.all_seq_id; return; } std::sort(ids.begin(), ids.end(), @@ -3230,7 +3205,7 @@ struct llama_sbatch { if (batch.pos) { return batch.pos[a] < batch.pos[b]; } - // no pos, sort by id (assuming batch.all_pos_1 is positive) + // no pos, sort by id return a < b; } // shared prompts go first @@ -3240,30 +3215,25 @@ struct llama_sbatch { // init seq llama_sbatch_seq * last_seq 
= nullptr; - if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) { - for (size_t i = 0; i < n_tokens; ++i) { - const size_t bi = ids[i]; - const int32_t n_seqs = batch.n_seq_id[bi]; - llama_seq_id * seq_ids = batch.seq_id[bi]; - if (last_seq != nullptr) { - bool same = n_seqs == last_seq->n_seq_id; - for (int32_t j = 0; same && j < n_seqs; ++j) { - if (seq_ids[j] != last_seq->seq_id[j]) { - same = false; - } - } - if (same) { - last_seq->length += 1; - continue; + for (size_t i = 0; i < n_tokens; ++i) { + const size_t bi = ids[i]; + const int32_t n_seqs = batch.n_seq_id[bi]; + llama_seq_id * seq_ids = batch.seq_id[bi]; + if (last_seq != nullptr) { + bool same = n_seqs == last_seq->n_seq_id; + for (int32_t j = 0; same && j < n_seqs; ++j) { + if (seq_ids[j] != last_seq->seq_id[j]) { + same = false; } } - llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id}; - seq.push_back(new_seq); - last_seq = &seq.back(); + if (same) { + last_seq->length += 1; + continue; + } } - } else { - llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id}; + llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1}; seq.push_back(new_seq); + last_seq = &seq.back(); } // keep shared prompts first at the end, then sort by length descending. std::sort(seq.begin(), seq.end(), @@ -3303,12 +3273,8 @@ struct llama_context { std::unordered_map lora_adapters; std::vector backends; -#ifdef GGML_USE_METAL - ggml_backend_t backend_metal = nullptr; -#endif -#ifdef GGML_USE_BLAS - ggml_backend_t backend_blas = nullptr; -#endif + std::vector> set_n_threads_fns; + ggml_backend_t backend_cpu = nullptr; ggml_threadpool_t threadpool = nullptr; @@ -3424,78 +3390,77 @@ struct llama_lora_adapter { } }; -static size_t llama_get_device_count(const llama_model & model) { - size_t count = 1; -#if defined(GGML_USE_CUDA) - count = ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - count = ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - count = ggml_backend_vk_get_device_count(); -#elif defined(GGML_USE_CANN) - return ggml_backend_cann_get_device_count(); -#endif +static int llama_get_device_count(const llama_model & model) { + int count = (int) model.devices.size(); + #if defined(GGML_USE_RPC) - count += model.rpc_servers.size(); + count += (int) model.rpc_servers.size(); #endif + return count; + GGML_UNUSED(model); } -static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_RPC - int rpc_count = (int)model.rpc_servers.size(); -#else - int rpc_count = 0; -#endif - int local_gpu = gpu - rpc_count; -#if defined(GGML_USE_RPC) - if (gpu < rpc_count) { - const char * endpoint = model.rpc_servers[gpu].c_str(); - return ggml_backend_rpc_buffer_type(endpoint); + if (host_buffer) { + for (auto * dev : model.devices) { + buft = ggml_backend_dev_host_buffer_type(dev); + if (buft != nullptr) { + break; + } + } } -#endif -#if defined(GGML_USE_METAL) - buft = ggml_backend_metal_buffer_type(); -#elif defined(GGML_USE_CUDA) - buft = ggml_backend_cuda_buffer_type(local_gpu); -#elif defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(local_gpu); -#elif defined(GGML_USE_SYCL) - buft = ggml_backend_sycl_buffer_type(local_gpu); -#elif defined(GGML_USE_KOMPUTE) - buft = ggml_backend_kompute_buffer_type(local_gpu); - if (buft == nullptr) { - LLAMA_LOG_WARN("%s: 
cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu); - } -#elif defined(GGML_USE_CANN) - buft = ggml_backend_cann_buffer_type(local_gpu); + +#if defined(GGML_USE_CPU_HBM) + buft = ggml_backend_cpu_hbm_buffer_type(); #endif if (buft == nullptr) { - buft = llama_default_buffer_type_cpu(true); + buft = ggml_backend_cpu_buffer_type(); } return buft; + + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) { + ggml_backend_buffer_type_t buft = nullptr; + + if (device < (int)model.devices.size()) { + return ggml_backend_dev_buffer_type(model.devices[device]); + } + device -= (int)model.devices.size(); + +#if defined(GGML_USE_KOMPUTE) + buft = ggml_backend_kompute_buffer_type(device); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(model, true); + } + return buft; + GGML_UNUSED(model); - GGML_UNUSED(local_gpu); } static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_CUDA - if (ggml_backend_cuda_get_device_count() > 1) { - buft = ggml_backend_cuda_split_buffer_type(tensor_split); - } -#endif + // find a backend that supports split buffers + for (size_t i = 0; i < ggml_backend_reg_count(); ++i) { + ggml_backend_reg_t reg = ggml_backend_reg_get(i); -#ifdef GGML_USE_SYCL - if (ggml_backend_sycl_get_device_count() > 1) { - buft = ggml_backend_sycl_split_buffer_type(tensor_split); + auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); + if (ggml_backend_split_buffer_type_fn) { + buft = ggml_backend_split_buffer_type_fn(tensor_split); + if (buft != nullptr) { + break; + } + } } -#endif if (buft == nullptr) { buft = llama_default_buffer_type_offload(model, fallback_gpu); @@ -3506,46 +3471,24 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo } static size_t llama_get_device_memory(const llama_model & model, int device) { -#ifdef GGML_USE_RPC - int rpc_count = (int)model.rpc_servers.size(); -#else - int rpc_count = 0; -#endif - int local_device = device - rpc_count; -#if defined(GGML_USE_RPC) - if (device < rpc_count) { + if (device < (int)model.devices.size()) { + ggml_backend_dev_t dev = model.devices[device]; size_t total; size_t free; - const char * endpoint = model.rpc_servers[device].c_str(); - ggml_backend_rpc_get_device_memory(endpoint, &free, &total); + ggml_backend_dev_memory(dev, &free, &total); return free; } -#endif -#if defined(GGML_USE_CUDA) - size_t total; - size_t free; - ggml_backend_cuda_get_device_memory(local_device, &free, &total); - return free; -#elif defined(GGML_USE_SYCL) - size_t total; - size_t free; - ggml_backend_sycl_get_device_memory(local_device, &free, &total); - return free; -#elif defined(GGML_USE_VULKAN) - size_t total; - size_t free; - ggml_backend_vk_get_device_memory(local_device, &free, &total); - return free; -#elif defined(GGML_USE_CANN) - size_t total; - size_t free; - ggml_backend_cann_get_device_memory(local_device, &free, &total); - return free; -#else + + if (model.devices.size() > 0) { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(model.devices[0]); + LLAMA_LOG_WARN("%s: failed to get free memmory of device:%d of backend:%s, for device id is out of range.\n", __func__, device, ggml_backend_reg_name(reg)); + } else { + LLAMA_LOG_WARN("%s: failed 
to get free memmory of device, no devices in inputted model.\n", __func__); + } return 1; -#endif + GGML_UNUSED(model); - GGML_UNUSED(local_device); + GGML_UNUSED(device); } // @@ -3588,7 +3531,7 @@ static bool llama_kv_cache_init( buft_layer_count[model.buft_layer[i].buft]++; } } else { - buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer; } // create a context for each buffer type @@ -4880,7 +4823,7 @@ struct llama_model_loader { static const int TENSOR_NOT_REQUIRED = 1; static const int TENSOR_DUPLICATED = 2; - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, int flags = 0) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list & ne, int flags = 0) { const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); if (cur == NULL) { @@ -4890,7 +4833,7 @@ struct llama_model_loader { return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED); } - struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector & ne, size_t offset, bool required = true) { + struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required = true) { const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); if (cur == NULL) { @@ -4903,7 +4846,7 @@ struct llama_model_loader { std::array dims; for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { - dims[i] = i < ne.size() ? ne[i] : 1; + dims[i] = i < ne.size() ? ne.begin()[i] : 1; } struct ggml_tensor * tensor = ggml_view_4d(ctx, base, @@ -5001,7 +4944,7 @@ struct llama_model_loader { // Returns false if cancelled by progress_callback bool load_all_data( struct ggml_context * ctx, - llama_buf_map & bufs_mmap, + llama_buf_map & bufs, llama_mlocks * lmlocks, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -5010,43 +4953,94 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; -#if defined(GGML_USE_CUDA) // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB std::vector host_buffers; - std::vector host_ptrs; std::vector events; + std::vector host_ptrs; size_t buffer_idx = 0; // buffer to use for async loads - - ggml_backend_t cuda_backend = nullptr; - if (!use_mmap && !check_tensors) { + ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t { + if (use_mmap || check_tensors) { + return nullptr; + } // When not using mmaped io use async uploads from pinned memory to GPU memory. - // First determine if the CUDA backend is active, and if so, determine the device ID. - ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; - if (buf) { - ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i); - if (buffer_type == cuda_buffer_type) { - cuda_backend = ggml_backend_cuda_init(i); - break; - } - } + // First determine if the backend supports the necessary features for async uploads. 
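Reviewer note: the comment above replaces what used to be a CUDA-only check; the "necessary features" are the generic device capabilities queried a few lines below. A minimal sketch of that gate, assuming the ggml-backend device API used in this hunk:

```cpp
// Sketch only: the capability gate behind "supports the necessary features".
#include "ggml-backend.h"

static bool device_supports_async_uploads(ggml_backend_dev_t dev) {
    ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);
    // async tensor_set, pinned host buffers and events are all needed for the
    // double-buffered upload loop used when mmap is disabled
    return props.caps.async && props.caps.host_buffer && props.caps.events;
}
```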
+ auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; + if (!buf) { + LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn); + return nullptr; } - // If the cuda backend is active create pinned memory buffers and events for synchronisation. - if (cuda_backend) { - for (size_t idx = 0; idx < n_buffers; ++idx) { - host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); - events.emplace_back(ggml_backend_event_new(cuda_backend)); - } + auto * buft = ggml_backend_buffer_get_type(buf); + auto * dev = ggml_backend_buft_get_device(buft); + if (!dev) { + LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn, + ggml_backend_buft_name(buft)); + return nullptr; } + + if (buft != ggml_backend_dev_buffer_type(dev)) { + LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn, + ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); + return nullptr; + } + + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { + LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (!host_buft) { + LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + // If the backend is supported, create pinned memory buffers and events for synchronisation. + for (size_t idx = 0; idx < n_buffers; ++idx) { + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { + LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + host_buffers.emplace_back(buf); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + + auto * event = ggml_backend_event_new(dev); + if (!event) { + LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + events.emplace_back(event); + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (!backend) { + LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + return backend; + }(__func__); + + if (upload_backend) { + LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, + ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), + ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), + ggml_backend_name(upload_backend)); } -#endif for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); @@ -5066,8 +5060,8 @@ struct llama_model_loader { if (use_mmap) { const auto & mapping = mappings.at(weight->idx); ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs_mmap.count(weight->idx)) { - buf_mmap = bufs_mmap.at(weight->idx); + if (bufs.count(weight->idx)) { + buf_mmap = bufs.at(weight->idx); } uint8_t * data = (uint8_t *) mapping->addr + weight->offs; @@ -5103,9 +5097,8 @@ struct llama_model_loader { })); } } else { -#if defined(GGML_USE_CUDA) - // If cuda_backend is valid load the tensor in chunks to pinned memory 
and upload the buffers asynchronously to the GPU. - if (cuda_backend) { + // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. + if (upload_backend) { file->seek(weight->offs, SEEK_SET); size_t bytes_read = 0; @@ -5115,17 +5108,14 @@ struct llama_model_loader { ggml_backend_event_synchronize(events[buffer_idx]); file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx]); + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); bytes_read += read_iteration; ++buffer_idx; buffer_idx %= n_buffers; } - } - else -#endif - { + } else { read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); file->read_raw(read_buf.data(), n_size); @@ -5140,17 +5130,15 @@ struct llama_model_loader { size_done += n_size; } -#if defined(GGML_USE_CUDA) - // free temporary resources used for async cuda uploads - if (cuda_backend) { - for (size_t idx = 0; idx < n_buffers;++idx) { - ggml_backend_event_synchronize(events[idx]); - ggml_backend_event_free(events[idx]); - ggml_backend_buffer_free(host_buffers[idx]); - } - ggml_backend_free(cuda_backend); + // free temporary resources used for async uploads + for (auto * event : events) { + ggml_backend_event_synchronize(event); + ggml_backend_event_free(event); } -#endif + for (auto * buf : host_buffers) { + ggml_backend_buffer_free(buf); + } + ggml_backend_free(upload_backend); // check validation results bool validation_failed = false; @@ -5466,8 +5454,10 @@ static void llm_load_hparams( } } else { switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; + case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B // granite uses a vocab with len 49152 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? 
e_model::MODEL_7B : e_model::MODEL_8B); break; case 36: model.type = e_model::MODEL_8B; break; // granite @@ -5580,11 +5570,11 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); hparams.f_max_alibi_bias = 8.0f; switch (hparams.n_layer) { - case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small + case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base } } break; @@ -6084,6 +6074,18 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_CHAMELEON: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default + ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_34B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -6117,14 +6119,14 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; - vocab.linefeed_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; + vocab.linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { @@ -6141,16 +6143,16 @@ static void llm_load_vocab( vocab.special_bos_id = 1; vocab.special_eos_id = 2; vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; vocab.special_unk_id = 100; vocab.special_sep_id = 102; vocab.special_pad_id = 0; @@ -6186,22 +6188,22 @@ static void llm_load_vocab( // default special tokens vocab.special_bos_id = 11; vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "t5") { vocab.type = LLAMA_VOCAB_TYPE_UGM; // default special tokens - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; vocab.special_eos_id = 1; vocab.special_unk_id = 2; 
- vocab.special_sep_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; vocab.special_pad_id = 0; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { @@ -6224,11 +6226,11 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -6277,6 +6279,7 @@ static void llm_load_vocab( tokenizer_pre == "phi-2" || tokenizer_pre == "jina-es" || tokenizer_pre == "jina-de" || + tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-code") { @@ -6311,7 +6314,7 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "chatglm-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; @@ -6341,6 +6344,11 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "exaone") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE; + } else if ( + tokenizer_pre == "chameleon") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -6398,7 +6406,12 @@ static void llm_load_vocab( for (uint32_t i = 0; i < n_vocab; i++) { std::string word = gguf_get_arr_str(ctx, token_idx, i); - GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); + + //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); + if (word.empty()) { + LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i); + word = "[EMPTY_" + std::to_string(i) + "]"; + } vocab.token_to_id[word] = i; vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size()); @@ -6423,46 +6436,10 @@ static void llm_load_vocab( } GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); + vocab.init_tokenizer(); + // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - // For Fill-In-the-Middle (FIM)/infill models which where converted - // prior to support of FIM special tokens in GGUF, the following - // will allow those models to continue to work. The general names - // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and - // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once - // new versions of these models have been published. - std::string gen_name; - ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); - - std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), - [](unsigned char c){ return std::tolower(c); }); - - if (gen_name.find("code") != std::string::npos) { - if (model.arch == LLM_ARCH_LLAMA - && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text.find("
") != std::string::npos
-              && vocab.id_to_token[32008].text.find("") != std::string::npos
-              && vocab.id_to_token[32009].text.find("") != std::string::npos
-              && vocab.id_to_token[32010].text.find("") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "<end_of_turn>") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
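Reviewer note: the block removed above hard-coded CodeLlama/CodeGemma token ids; the replacement is the dedicated tokenizer.ggml.fim_*_token_id keys plus the text-based detection added further down in this file. A minimal sketch of how a caller checks the result, assuming the llama_token_fim_* accessors that accompany this change in llama.h:

```cpp
// Sketch only: querying FIM support through the public API instead of
// relying on hard-coded token ids (accessor names assumed from this change).
#include "llama.h"

static bool model_supports_infill(const llama_model * model) {
    // a negative id (LLAMA_TOKEN_NULL) means the token is neither stored in
    // the GGUF metadata nor auto-detected from the vocabulary text
    return llama_token_fim_pre(model) >= 0 &&
           llama_token_fim_suf(model) >= 0 &&
           llama_token_fim_mid(model) >= 0;
}
```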
         try {
             vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
@@ -6477,25 +6454,39 @@ static void llm_load_vocab(
         vocab.linefeed_id = ids[0];
     } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+
+        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        if (ids.empty()) {
+            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+            vocab.linefeed_id = vocab.special_pad_id;
+        } else {
+            vocab.linefeed_id = ids[0];
+        }
     }
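Reviewer note: the fallback above tokenizes "\xC4\x8A" rather than "\n" because byte-level BPE vocabs do not store raw control bytes. A short worked check, assuming the GPT-2 byte-to-unicode convention used by these tokenizers:

```cpp
// Sketch only: byte-level BPE remaps non-printable bytes into printable code
// points; for the low control bytes the shift is exactly +0x100, so 0x0A
// ('\n') is stored in the vocab as U+010A ('Ċ'), whose UTF-8 encoding is the
// two bytes C4 8A used above.
constexpr unsigned newline_byte      = 0x0A;
constexpr unsigned newline_codepoint = newline_byte + 0x100; // U+010A
static_assert(newline_codepoint == 0x010A, "BPE vocab stores '\\n' as U+010A");
```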
 
     // special tokens
     {
        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -6526,46 +6517,140 @@ static void llm_load_vocab(
             }
         }
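Reviewer note: the deprecated PREFIX/SUFFIX/MIDDLE keys stay in the table above but now bind to the same special_fim_* fields, so older GGUFs keep working. A minimal sketch of the resulting precedence (illustrative helper, not part of the patch):

```cpp
// Sketch only: effect of listing the legacy keys after the new FIM_* keys -
// a key that is absent leaves the field untouched, a key that is present
// overwrites it, so the last key found wins and legacy-only files still
// populate the FIM ids.
#include <cstdint>
#include <optional>

static int32_t resolve_special_id(std::optional<int32_t> new_key,
                                  std::optional<int32_t> legacy_key) {
    int32_t id = -1; // LLAMA_TOKEN_NULL
    if (new_key)    { id = *new_key;    }
    if (legacy_key) { id = *legacy_key; }
    return id;
}
```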
 
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
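Reviewer note: "provide these tokens through the KV metadata" means a converter writes keys such as tokenizer.ggml.fim_pre_token_id; when such a key is present, the loop below leaves the id alone because it is no longer LLAMA_TOKEN_NULL. A minimal sketch of reading one of these keys (key name taken from the table earlier in this file; u32 storage is an assumption):

```cpp
// Sketch only: metadata takes precedence over text-based detection.
#include "ggml.h"

static int32_t read_fim_pre_id(const gguf_context * gguf) {
    const int keyidx = gguf_find_key(gguf, "tokenizer.ggml.fim_pre_token_id");
    return keyidx < 0 ? -1 /* LLAMA_TOKEN_NULL */
                      : (int32_t) gguf_get_val_u32(gguf, keyidx);
}
```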
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
-                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
-                        //       need to fix convert script
-                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         || t.first == "<|eot_id|>"
                         || t.first == "<|im_end|>"
                         || t.first == "<|end|>"
                        || t.first == "<end_of_turn>"
                         || t.first == "<|endoftext|>"
                        || t.first == "<EOT>"
+                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
-                    break;
                 }
             }
-        }
 
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                        __func__, t->first.c_str());
-                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == "<fim-prefix>"
+                        || t.first == "<|fim▁begin|>" // DeepSeek
+                        || t.first == "<PRE>"
+                        ) {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
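Reviewer note: every detection block in this loop repeats the same warn-and-override fix-up when the matched token is not flagged as control. A possible follow-up consolidation, sketched with an illustrative helper name (not part of the patch):

```cpp
// Sketch only: the repeated fix-up pulled into one helper.
#include <string>

static void set_special_token(llama_vocab & vocab, llama_token & slot,
                              llama_token id, const std::string & text) {
    slot = id;
    if ((vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; "
                       "this is probably a bug in the model. its type will be overridden\n",
                       __func__, id, text.c_str());
        vocab.id_to_token[id].attr = LLAMA_TOKEN_ATTR_CONTROL;
    }
}
```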
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == "<fim-suffix>"
+                        || t.first == "<|fim▁hole|>" // DeepSeek
+                        || t.first == "<SUF>"
+                        ) {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == "<fim-middle>"
+                        || t.first == "<|fim▁end|>"  // DeepSeek
+                        || t.first == "<MID>"
+                        ) {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == "<fim-pad>"
+                        || t.first == "<PAD>"
+                        ) {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REPO>", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == "<fim-repo>"
+                        || t.first == "<REPO>"
+                        ) {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
             }
         }
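Reviewer note: with the loop above filling the special_fim_* ids, an infill prompt no longer needs per-model token ids. A minimal sketch of assembling one (PSM ordering assumed; actual templates vary by model family):

```cpp
// Sketch only: building a fill-in-the-middle prompt from the new special ids.
#include <vector>

static std::vector<llama_token> build_fim_prompt(
        const llama_vocab & vocab,
        const std::vector<llama_token> & prefix,
        const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(vocab.special_fim_pre_id);
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(vocab.special_fim_suf_id);
    out.insert(out.end(), suffix.begin(), suffix.end());
    out.push_back(vocab.special_fim_mid_id);
    return out; // generation continues after FIM_MID and stops on an EOG token
}
```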
@@ -6574,6 +6659,19 @@ static void llm_load_vocab(
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
         vocab.special_eog_ids.clear();
+
+        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+        }
+
+        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+        }
+
+        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+        }
+
         for (const auto & t : vocab.token_to_id) {
             if (false
                     || t.first == "<|eot_id|>"
@@ -6586,24 +6684,31 @@ static void llm_load_vocab(
                ) {
                 vocab.special_eog_ids.insert(t.second);
                 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
                     vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
+            } else {
+                // token is control, but not marked as EOG -> print a debug log
+                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
             }
         }
 
-        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+        // sanity checks
+        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eot_id);
             LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
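Reviewer note: downstream code never has to know which of EOS/EOT/EOM or the FIM pad/repo/sep ids ended generation; the public accessor answers for every id inserted into special_eog_ids above. A minimal sketch of the consuming loop:

```cpp
// Sketch only: stopping on any end-of-generation token.
#include "llama.h"

static void generate_until_eog(llama_context * ctx, const llama_model * model,
                               llama_sampler * smpl) {
    while (true) {
        const llama_token tok = llama_sampler_sample(smpl, ctx, -1);
        if (llama_token_is_eog(model, tok)) {
            break; // covers every id in special_eog_ids
        }
        // ... decode `tok`, append it to the output and feed it back
    }
}
```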
@@ -6797,20 +6902,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id    != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
-    if (vocab.special_eos_id    != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
-    if (vocab.special_unk_id    != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
-    if (vocab.special_sep_id    != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
-    if (vocab.special_pad_id    != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
-    if (vocab.special_cls_id    != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
-    if (vocab.special_mask_id   != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
+    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
-    if (vocab.linefeed_id       != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,       vocab.id_to_token[vocab.linefeed_id].text.c_str() );       }
-    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token        = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
-    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
-    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
-    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
-    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -6853,6 +6962,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
@@ -6862,14 +6978,21 @@ static bool llm_load_tensors(
     bool use_mmap_buffer = true;
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
+    model.buft_input = llama_default_buffer_type_cpu(model, true);
     //model.buft_input = llama_default_buffer_type_offload(main_gpu);
 
     model.buft_layer.resize(n_layer);
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
-        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(model, true)
+        };
+#else
+        model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6907,7 +7030,7 @@ static bool llm_load_tensors(
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
@@ -6931,7 +7054,7 @@ static bool llm_load_tensors(
                 llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     }
 
@@ -7351,6 +7474,12 @@ static bool llm_load_tensors(
 
                     if (model.arch == LLM_ARCH_BERT) {
                         model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train});
+
+                        model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        model.cls_out   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
 
                     model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
@@ -7403,6 +7532,8 @@ static bool llm_load_tensors(
                     model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
                     model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}); //LayerNorm bias
 
+                    model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -8728,6 +8859,45 @@ static bool llm_load_tensors(
                     }
 
                 } break;
+            case LLM_ARCH_CHAMELEON:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
+                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
+                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -8753,55 +8923,40 @@ static bool llm_load_tensors(
         llama_buf_map bufs;
         bufs.reserve(n_max_backend_buffer);
 
-        // only the mmap region containing the tensors in the model is mapped to the backend buffer
-        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
-        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        // check if this backend device supports buffer_from_host_ptr
+        // when using a host buffer as the CPU backend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? ggml_backend_cpu_buffer_type() : buft);
+        bool buffer_from_host_ptr_supported = false;
+        if (dev) {
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        }
+
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
-                size_t first, last;
+                size_t first, last; // NOLINT
                 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                 if (first >= last) {
                     continue;
                 }
-                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
-                if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend CPU buffer");
-                }
-                model.bufs.push_back(buf);
-                bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUDA
-                if (n_layer >= n_gpu_layers) {
-                    ggml_backend_cuda_register_host_buffer(
-                        ggml_backend_buffer_get_base(buf),
-                        ggml_backend_buffer_get_size(buf));
-                }
-#endif
-            }
-        }
-#ifdef GGML_USE_METAL
-        else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                void * addr = nullptr;
-                size_t first, last;
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
-                }
-                ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend metal buffer");
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
                 model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
-#endif
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
-                throw std::runtime_error("unable to allocate backend buffer");
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
             model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
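The mmap path above replaces the CUDA/Metal-specific branches with a single generic capability query. As a minimal sketch (helper name is illustrative, headers assumed to be the standard ggml-backend ones), the check the loader now performs boils down to:

```cpp
// Sketch: ask a backend device whether it can wrap an existing host pointer
// (e.g. an mmap-ed model file) instead of allocating and copying a new buffer.
#include "ggml-backend.h"

static bool device_supports_host_ptr(ggml_backend_dev_t dev) {
    if (dev == nullptr) {
        return false; // backend does not expose a device (old interface)
    }
    ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);
    return props.caps.buffer_from_host_ptr;
}
```

Any backend that reports `buffer_from_host_ptr` can map the file region directly via `ggml_backend_dev_buffer_from_host_ptr`, which is why the per-backend `#ifdef`s could be removed.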
@@ -10197,6 +10352,10 @@ struct llm_build_context {
         struct ggml_tensor * cur;
 
         switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
             case LLAMA_POOLING_TYPE_MEAN:
                 {
                     struct ggml_tensor * inp_mean = build_inp_mean();
@@ -10208,9 +10367,26 @@ struct llm_build_context {
                     struct ggml_tensor * inp_cls = build_inp_cls();
                     cur = ggml_get_rows(ctx0, inp, inp_cls);
                 } break;
-            case LLAMA_POOLING_TYPE_NONE:
+            case LLAMA_POOLING_TYPE_RANK:
                 {
-                    cur = inp;
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    inp = ggml_get_rows(ctx0, inp, inp_cls);
+
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    GGML_ASSERT(model.cls       != nullptr);
+                    GGML_ASSERT(model.cls_b     != nullptr);
+
+                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (model.cls_out) {
+                        GGML_ASSERT(model.cls_out_b != nullptr);
+
+                        cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
+                    }
                 } break;
             default:
                 {
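For reference, the `LLAMA_POOLING_TYPE_RANK` branch above applies a small RoBERTa-style classification head to the pooled CLS row. A standalone CPU sketch of the same computation (function name, shapes and the handling of a missing `cls_out` are illustrative assumptions, not the loader's contract):

```cpp
// score = w_out . tanh(W_cls * h + b_cls) + b_out   (cls_out projection is optional)
#include <cmath>
#include <vector>

// h: pooled hidden state (n_embd)
// W: cls weight (n_out x n_embd), b: cls bias (n_out)
// w_out / b_out: optional cls_out projection down to a single score
float rank_head(const std::vector<float> & h,
                const std::vector<std::vector<float>> & W, const std::vector<float> & b,
                const std::vector<float> & w_out, float b_out) {
    std::vector<float> t(W.size());
    for (size_t i = 0; i < W.size(); ++i) {
        float acc = b[i];
        for (size_t j = 0; j < h.size(); ++j) {
            acc += W[i][j] * h[j];
        }
        t[i] = std::tanh(acc);
    }
    if (w_out.empty()) {
        return t[0]; // some rerankers fold the final projection into cls itself
    }
    float s = b_out;
    for (size_t i = 0; i < t.size(); ++i) {
        s += w_out[i] * t[i];
    }
    return s;
}
```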
@@ -11439,8 +11615,8 @@ struct llm_build_context {
             inpL = cur;
         }
 
-        // final output
         cur = inpL;
+
         cb(cur, "result_embd", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -15865,9 +16041,189 @@ struct llm_build_context {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_norm", -1);
 
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // ref: https://github.com/facebookresearch/chameleon
+    // based on the original build_llama() function, changes:
+    //   * qk-norm
+    //   * swin-norm
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_chameleon() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            if (hparams.swin_norm) {
+                cur = inpL;
+            } else {
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                model.layers[il].attn_q_norm_b,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                               model.layers[il].attn_k_norm,
+                               model.layers[il].attn_k_norm_b,
+                               LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+                if (hparams.swin_norm) {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                }
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            if (!hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            if (hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output_with_img_logits", -1);
+
+        // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+        // Needs to be removed once image outputs are supported.
+        int img_token_end_idx = 8196;
+        int img_token_start_idx = 4;
+        int num_img_tokens = img_token_end_idx - img_token_start_idx;
+        // create a 1d tensor of size num_img_tokens filled with -FLT_MAX,
+        // which ensures that text token logits are always larger than image token logits
+        struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+        img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+        cb(img_logits, "img_logits", -1);
+        cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+        cb(cur, "result_output", -1);
+
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -16132,6 +16488,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_rwkv6();
             } break;
+        case LLM_ARCH_CHAMELEON:
+            {
+                result = llm.build_chameleon();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -16418,7 +16778,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         }
     }
 
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.embeddings && (
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
         const int64_t n_tokens     = batch.n_tokens;
         const int64_t n_seq_tokens = batch.n_seq_tokens;
         const int64_t n_seqs       = batch.n_seqs;
@@ -16433,7 +16795,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             const llama_seq_id seq_id = batch.seq_id[s][0];
 
             // TODO: adapt limits to n_seqs when batch.equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
                 const llama_pos pos = batch.pos[s*n_seq_tokens + i];
@@ -16635,7 +16997,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
             lctx.embd = nullptr;
         }
 
-        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
         if (lctx.buf_output == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
             return 0;
@@ -16704,24 +17066,20 @@ static void llama_graph_compute(
             ggml_cgraph * gf,
                     int   n_threads,
         ggml_threadpool * threadpool) {
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
-#ifdef GGML_USE_BLAS
-    if (lctx.backend_blas != nullptr) {
-        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
-    }
-#endif
 
-    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    // set the number of threads for all the backends
+    for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
+        set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
+    }
+
+    auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    if (err != GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+    }
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }
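The `#ifdef` blocks for Metal and BLAS thread counts are gone: `llama_graph_compute()` now walks a list of (backend, set_n_threads) pairs that the context collects at creation time. A minimal sketch of that registry lookup (helper name is illustrative, assuming the ggml-backend registry API used elsewhere in this diff):

```cpp
// Resolve the optional "ggml_backend_set_n_threads" entry point for each backend
// through its registry, mirroring how lctx.set_n_threads_fns is populated.
#include "ggml-backend.h"
#include <utility>
#include <vector>

using set_n_threads_pair = std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>;

static void collect_set_n_threads(const std::vector<ggml_backend_t> & backends,
                                  std::vector<set_n_threads_pair> & out) {
    for (ggml_backend_t backend : backends) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
        if (!reg) {
            continue; // backend does not expose a device/registry entry
        }
        auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (fn) {
            out.emplace_back(backend, fn);
        }
    }
}
```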
@@ -16737,10 +17095,10 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch   batch_all) { // TODO: rename back to batch
+           llama_batch   batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch_all.n_tokens;
+    const uint32_t n_tokens_all = batch.n_tokens;
 
     if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -16751,12 +17109,12 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
-    if (batch_all.token) {
+    if (batch.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                 return -1;
             }
         }
@@ -16787,9 +17145,9 @@ static int llama_decode_internal(
     lctx.embd_seq.clear();
 
     // count outputs
-    if (batch_all.logits && !embd_pooled) {
+    if (batch.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs += batch_all.logits[i] != 0;
+            n_outputs += batch.logits[i] != 0;
         }
     } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
@@ -16798,7 +17156,7 @@ static int llama_decode_internal(
         n_outputs = 1;
     }
 
-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch, n_embd,
         /* simple_split */ !kv_self.recurrent,
         /* logits_all   */ n_outputs == n_tokens_all);
 
@@ -16973,6 +17331,20 @@ static int llama_decode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // extract the rerank score - a single float per sequence
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(1);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        }
+                    } break;
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");
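Since RANK pooling writes exactly one float per sequence into `lctx.embd_seq`, callers can read the rerank score back per sequence. A hedged usage sketch (helper name is illustrative; assumes embeddings are enabled, the context uses RANK pooling, and the sequence has already been decoded):

```cpp
#include "llama.h"
#include <cstdio>

void print_rank_score(llama_context * ctx, llama_seq_id seq) {
    const float * score = llama_get_embeddings_seq(ctx, seq);
    if (score) {
        printf("seq %d rerank score: %f\n", seq, score[0]); // a single float per sequence
    }
}
```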
@@ -17179,6 +17551,13 @@ static int llama_encode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // TODO: this likely should be the same logic as in llama_decode_internal, but better to
+                        //       wait for an encoder model that requires this pooling type in order to test it
+                        //       https://github.com/ggerganov/llama.cpp/pull/9510
+                        GGML_ABORT("RANK pooling not implemented yet");
+                    }
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");
@@ -17516,10 +17895,9 @@ static void llama_tensor_dequantize_internal(
     }
     float * f32_output = (float *) output.data();
 
-    ggml_type_traits_t qtype;
+    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
     if (ggml_is_quantized(tensor->type)) {
-        qtype = ggml_internal_get_type_traits(tensor->type);
-        if (qtype.to_float == NULL) {
+        if (qtype->to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
     } else if (tensor->type != GGML_TYPE_F16 &&
@@ -17533,7 +17911,7 @@ static void llama_tensor_dequantize_internal(
         } else if (tensor->type == GGML_TYPE_BF16) {
             ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
+            qtype->to_float(tensor->data, f32_output, nelements);
         } else {
             GGML_ABORT("fatal error"); // unreachable
         }
@@ -17569,7 +17947,7 @@ static void llama_tensor_dequantize_internal(
             } else if (typ == GGML_TYPE_BF16) {
                 ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.to_float(inbuf, outbuf, nels);
+                qtype->to_float(inbuf, outbuf, nels);
             }
         };
         workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
@@ -18651,21 +19029,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }
 
 size_t llama_max_devices(void) {
-#if defined(GGML_USE_RPC)
-    return GGML_RPC_MAX_SERVERS;
-#elif defined(GGML_USE_METAL)
-    return 1;
-#elif defined(GGML_USE_CUDA)
-    return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-    return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-    return GGML_VK_MAX_DEVICES;
-#elif defined(GGML_USE_CANN)
-    return GGML_CANN_MAX_DEVICES;
-#else
-    return 1;
-#endif
+    return 16;
 }
 
 bool llama_supports_mmap(void) {
@@ -18677,15 +19041,20 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)   || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+#if defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
-    return false;
+    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+           llama_supports_rpc();
 #endif
 }
 
+bool llama_supports_rpc(void) {
+    return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
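A short usage sketch of the capability helpers; `llama_supports_gpu_offload()` now asks the device registry instead of relying on compile-time defines, and `llama_supports_rpc()` simply checks whether an "RPC" backend is registered:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();
    printf("mmap: %d, mlock: %d, gpu offload: %d, rpc: %d\n",
           llama_supports_mmap(), llama_supports_mlock(),
           llama_supports_gpu_offload(), llama_supports_rpc());
    llama_backend_free();
    return 0;
}
```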
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -18747,17 +19116,72 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
+
     if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
-        while ((pos = servers.find(",")) != std::string::npos) {
+        while ((pos = servers.find(',')) != std::string::npos) {
             std::string server = servers.substr(0, pos);
             model->rpc_servers.push_back(server);
             servers.erase(0, pos + 1);
         }
         model->rpc_servers.push_back(servers);
     }
+
+    // add RPC devices
+    if (!model->rpc_servers.empty()) {
+        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+        if (!rpc_reg) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+        using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+        if (!ggml_backend_rpc_add_device_fn) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        for (const std::string & server : model->rpc_servers) {
+            ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (dev) {
+                model->devices.push_back(dev);
+            } else {
+                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                llama_free_model(model);
+                return nullptr;
+            }
+        }
+    }
+
+    // create list of devices to use with this model
+    // currently, we use all available devices
+    // TODO: rework API to give user more control over device selection
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        switch (ggml_backend_dev_type(dev)) {
+            case GGML_BACKEND_DEVICE_TYPE_CPU:
+            case GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
+                // skip CPU backends since they are handled separately
+                break;
+
+            case GGML_BACKEND_DEVICE_TYPE_GPU:
+            case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+            {
+                size_t free, total; // NOLINT
+                ggml_backend_dev_memory(dev, &free, &total);
+                LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+                model->devices.push_back(dev);
+                break;
+            }
+        }
+    }
+
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
@@ -18766,7 +19190,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
-        delete model;
+        llama_free_model(model);
         return nullptr;
     }
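RPC endpoints are still passed as a comma-separated string in the model params; each endpoint is turned into a device through the RPC backend's `ggml_backend_rpc_add_device` proc address resolved above. A hedged sketch of the caller side (helper name and endpoints are placeholders):

```cpp
#include "llama.h"

llama_model * load_with_rpc(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052"; // assumed endpoints
    mparams.n_gpu_layers = 99;                                      // offload as much as possible
    return llama_load_model_from_file(path, mparams);
}
```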
 
@@ -18806,7 +19230,7 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
-    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
     }
@@ -18919,102 +19343,39 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
-        }
-#endif
-
-#if defined(GGML_USE_METAL)
-        if (model->n_gpu_layers > 0) {
-            ctx->backend_metal = ggml_backend_metal_init();
-            if (ctx->backend_metal == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(ctx->backend_metal);
-        }
-#elif defined(GGML_USE_CUDA)
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cuda_init(device);
+            for (auto * dev : model->devices) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
         }
-#elif defined(GGML_USE_VULKAN)
-        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
         }
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_vk_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_SYCL)
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
-            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                ggml_backend_t backend = ggml_backend_sycl_init(i);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_KOMPUTE)
+
+#if defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19022,40 +19383,21 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
-#elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-        for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-            ggml_backend_t backend = ggml_backend_cann_init(device);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
 #endif
 
-#ifdef GGML_USE_BLAS
-        ctx->backend_blas = ggml_backend_blas_init();
-        if (ctx->backend_blas == nullptr) {
-            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
-        } else {
-            ctx->backends.push_back(ctx->backend_blas);
+        // add other backends (such as BLAS)
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-#endif
 
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
@@ -19065,6 +19407,18 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
+        // create a list of the set_n_threads functions in the backends
+        for (auto * backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (ggml_backend_set_n_threads_fn) {
+                    ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -19084,7 +19438,7 @@ struct llama_context * llama_new_context_with_model(
             }
 
             LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                      (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
@@ -19110,7 +19464,7 @@ struct llama_context * llama_new_context_with_model(
             for (auto * backend : ctx->backends) {
                 if (ggml_backend_is_cpu(backend)) {
                     // use host buffers for the CPU backend compute buffer
-                    backend_buft.push_back(llama_default_buffer_type_cpu(true));
+                    backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
                 } else {
                     backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
                 }
@@ -19121,17 +19475,37 @@ struct llama_context * llama_new_context_with_model(
             // buffer used to store the computation graph and the tensor meta data
             ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
+            // TODO: move these checks to ggml_backend_sched
             // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
             bool pipeline_parallel =
                 llama_get_device_count(*model) > 1 &&
                 model->n_gpu_layers > (int)model->hparams.n_layer &&
                 model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
                 params.offload_kqv;
-#ifndef GGML_USE_CUDA
-            // pipeline parallelism requires support for async compute and events
-            // currently this is only implemented in the CUDA backend
-            pipeline_parallel = false;
-#endif
+
+            // pipeline parallelism requires support for async compute and events in all devices
+            if (pipeline_parallel) {
+                for (auto * backend : ctx->backends) {
+                    if (ggml_backend_is_cpu(backend)) {
+                        // ignore CPU backend
+                        continue;
+                    }
+                    auto * dev = ggml_backend_get_device(backend);
+                    if (!dev) {
+                        // backend is using old interface, not supported
+                        pipeline_parallel = false;
+                        break;
+                    }
+                    ggml_backend_dev_props props;
+                    ggml_backend_dev_get_props(dev, &props);
+                    if (!props.caps.async || !props.caps.events) {
+                        // device does not support async compute or events
+                        pipeline_parallel = false;
+                        break;
+                    }
+                }
+            }
+
             ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
 
             if (pipeline_parallel) {
@@ -19257,6 +19631,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -20659,9 +21034,7 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
 
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
-                 int32_t   n_tokens,
-               llama_pos   pos_0,
-            llama_seq_id   seq_id) {
+                 int32_t   n_tokens) {
     return {
         /*n_tokens       =*/ n_tokens,
         /*tokens         =*/ tokens,
@@ -20670,9 +21043,6 @@ struct llama_batch llama_batch_get_one(
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
         /*logits         =*/ nullptr,
-        /*all_pos_0      =*/ pos_0,
-        /*all_pos_1      =*/ 1,
-        /*all_seq_id     =*/ seq_id,
     };
 }
 
@@ -20685,9 +21055,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
         /*logits         =*/ nullptr,
-        /*all_pos_0      =*/ 0,
-        /*all_pos_1      =*/ 0,
-        /*all_seq_id     =*/ 0,
     };
 
     if (embd) {
@@ -20723,11 +21090,62 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits)   free(batch.logits);
 }
 
+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+    struct llama_batch          batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx->kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
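With `llama_batch_allocr` filling in positions, sequence ids and logits flags on demand, `llama_batch_get_one()` shrinks to two arguments. A sketch of the updated call pattern (helper name is illustrative):

```cpp
#include "llama.h"
#include <vector>

int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    // previously: llama_batch_get_one(tokens.data(), n, /*pos_0=*/0, /*seq_id=*/0)
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    return llama_decode(ctx, batch); // positions etc. are filled in internally
}
```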
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
 
@@ -20737,8 +21155,9 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
@@ -20917,6 +21336,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return llama_token_eos_impl(model->vocab);
 }
 
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
     return llama_token_cls_impl(model->vocab);
 }
@@ -20953,8 +21376,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
     return llama_token_suffix_impl(model->vocab);
 }
 
-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }
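A hedged sketch of how the new FIM accessors might be combined into an infill prompt (helper name is illustrative; the prefix-suffix-middle ordering is an assumption, and some models expect the suffix-first variant instead):

```cpp
#include "llama.h"
#include <vector>

std::vector<llama_token> build_fim(const llama_model * model,
                                   const std::vector<llama_token> & prefix,
                                   const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(llama_token_fim_pre(model));                 // <PRE>
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(llama_token_fim_suf(model));                 // <SUF>
    out.insert(out.end(), suffix.begin(), suffix.end());
    out.push_back(llama_token_fim_mid(model));                 // <MID> - generation continues from here
    return out;
}
```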
 
 //
@@ -21255,6 +21698,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
+    } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world") || tmpl_contains("'User: ' + message['content'] + '\n\nAssistant:'")) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << "User: " << message->content << "\n\nAssistant:";
+            } else {
+                ss << message->content << "\n\n";
+            }
+        }
     } else {
         // template not supported
         return -1;
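A small usage sketch of the newly recognized template branch; per the code above, a single user message should render as "User: Hello\n\nAssistant:" (function name is illustrative, and passing the template string directly is assumed to bypass the model lookup):

```cpp
#include "llama.h"
#include <cstdio>
#include <vector>

void demo_rwkv_template() {
    llama_chat_message msg = { "user", "Hello" };
    std::vector<char> buf(256);
    int32_t n = llama_chat_apply_template(nullptr, "rwkv-world", &msg, 1,
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0) {
        printf("%.*s\n", n, buf.data());
    }
}
```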
@@ -21317,6 +21769,10 @@ bool llama_sampler_is_grammar_empty(struct llama_sampler * gsmpl) {
     return llama_sampler_is_grammar_empty_impl(gsmpl);
 }
 
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
+}
+
 //
 // model split
 //
@@ -21356,6 +21812,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
+    s += "AMX_INT8 = "    + std::to_string(ggml_cpu_has_amx_int8())    + " | ";
     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
     s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";
@@ -21440,15 +21897,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 }
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-#ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -21457,12 +21908,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);
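Since llama_log_set now forwards to ggml_log_set, one callback covers both libraries. A minimal sketch, assuming llama.h is included and using the standard ggml_log_callback signature:

```cpp
#include <cstdio>

static void my_log_cb(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr); // messages already contain their own newlines
}

// during startup, before loading any model:
llama_log_set(my_log_cb, /*user_data=*/nullptr);
```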
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 02bdf7823..04dcd7fcf 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -7,7 +7,7 @@
 #include 
 #include 
 
-const std::vector> unicode_ranges_flags = {  // start, flags // last=next_start-1
+const std::initializer_list> unicode_ranges_flags = {  // start, flags // last=next_start-1
 {0x000000, 0x0080},
 {0x000020, 0x0008},
 {0x000021, 0x0020},
@@ -2311,7 +2311,8 @@ const std::unordered_set unicode_set_whitespace = {
 0x003000,
 };
 
-const std::unordered_map unicode_map_lowercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
@@ -3747,7 +3748,8 @@ const std::unordered_map unicode_map_lowercase = {
 {0x01E921, 0x01E943},
 };
 
-const std::unordered_map unicode_map_uppercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
@@ -5200,7 +5202,7 @@ const std::unordered_map unicode_map_uppercase = {
 {0x01E943, 0x01E921},
 };
 
-const std::vector unicode_ranges_nfd = {  // start, last, nfd
+const std::initializer_list unicode_ranges_nfd = {  // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},
diff --git a/src/unicode-data.h b/src/unicode-data.h
index e27fe1770..f6973ebd2 100644
--- a/src/unicode-data.h
+++ b/src/unicode-data.h
@@ -13,8 +13,8 @@ struct range_nfd {
 
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
-extern const std::vector> unicode_ranges_flags;
+extern const std::initializer_list> unicode_ranges_flags;
 extern const std::unordered_set unicode_set_whitespace;
-extern const std::unordered_map unicode_map_lowercase;
-extern const std::unordered_map unicode_map_uppercase;
-extern const std::vector unicode_ranges_nfd;
+extern const std::initializer_list> unicode_map_lowercase;
+extern const std::initializer_list> unicode_map_uppercase;
+extern const std::initializer_list unicode_ranges_nfd;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index f4e941cd1..50b35bbbc 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 static std::vector unicode_cpt_flags_array() {
     std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
 
-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
     for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1];  // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i];    // codepoint_end, flags
+        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
         for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
             cpt_flags[cpt] = range_ini.second;
         }
@@ -597,7 +597,7 @@ std::vector unicode_cpts_normalize_nfd(const std::vector & c
     std::vector result(cpts.size());
     for (size_t i = 0; i < cpts.size(); ++i) {
         const uint32_t cpt = cpts[i];
-        auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
         result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
     }
     return result;
@@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
 }
 
 uint32_t unicode_tolower(uint32_t cp) {
-    auto it = unicode_map_lowercase.find(cp);
-    return it == unicode_map_lowercase.end() ? cp : it->second;
+    // binary search
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+        [](const std::pair & pair, uint32_t value) {
+            return pair.first < value;
+        });
+    if (it != unicode_map_lowercase.end() && it->first == cp) {
+        return it->second;
+    }
+    return cp;  // Return the original code point if no lowercase mapping is found
 }
 
 std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) {
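The replacement relies on the mapping tables being sorted by codepoint (as the added comments state), so a plain std::lower_bound suffices. A self-contained sketch of the same lookup pattern, with the pair type spelled out as an assumption:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns the mapped codepoint if present, otherwise the input unchanged.
static uint32_t sorted_map_lookup(const std::vector<std::pair<uint32_t, uint32_t>> & sorted_map, uint32_t cp) {
    auto it = std::lower_bound(sorted_map.begin(), sorted_map.end(), cp,
        [](const std::pair<uint32_t, uint32_t> & p, uint32_t value) {
            return p.first < value;
        });
    return (it != sorted_map.end() && it->first == cp) ? it->second : cp;
}
```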
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index e07d09733..3665238b5 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -10,12 +10,12 @@
 #include 
 
 int main(void) {
-    gpt_params params;
+    common_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
             std::unordered_set seen_args;
             std::unordered_set seen_env_vars;
             for (const auto & opt : ctx_arg.options) {
@@ -58,44 +58,44 @@ int main(void) {
 
     // missing value
     argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (int)
     argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (enum)
     argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
     argv = {"binary_name", "--draft", "123"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
 
     printf("test-arg-parser: test valid usage\n\n");
 
     argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.cpuparams.n_threads == 1234);
 
     argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.verbosity > 1);
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
     // --draft cannot be used outside llama-speculative
     argv = {"binary_name", "--draft", "123"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
     assert(params.n_draft == 123);
 
 // skip this part on windows, because setenv is not supported
@@ -106,12 +106,12 @@ int main(void) {
 
     setenv("LLAMA_ARG_THREADS", "blah", true);
     argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
@@ -121,7 +121,7 @@ int main(void) {
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
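Outside the test, the renamed entry points are used the same way as before; a short sketch of a typical call site, using the same four-argument form exercised above:

```cpp
#include "common.h"

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1; // a usage message has already been printed
    }
    // params.model, params.cpuparams.n_threads, ... are now populated, with
    // LLAMA_ARG_* environment variables applied before CLI overrides.
    return 0;
}
```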
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9a96cfc4c..ee1a8877e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,6 +1,6 @@
 // This file defines tests for various GGML ops and backends.
 // For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
-// For the backwards pass it asserts that the gradients from backpropagation are consistent
+// For the backward pass it asserts that the gradients from backpropagation are consistent
 // with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
 // It is also possible to check the performance ("perf" mode).
 //
@@ -32,63 +32,52 @@
 #include 
 #include 
 #include 
+#include <future>

 #include 
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector generators = []() {
-        std::random_device rd;
-        std::vector vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
+    size_t nels = ggml_nelements(tensor);
+    std::vector data(nels);
+    {
+        // parallel initialization
+        static const size_t n_threads = std::thread::hardware_concurrency();
+        // static RNG initialization (revisit if n_threads stops being constant)
+        static std::vector generators = []() {
+            std::random_device rd;
+            std::vector vec;
+            vec.reserve(n_threads);
+            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+            return vec;
+        }();
 
-    size_t size = ggml_nelements(tensor);
-    std::vector data(size);
+        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+            std::uniform_real_distribution distribution(min, max);
+            auto & gen = generators[ith];
+            for (size_t i = start; i < end; i++) {
+                data[i] = distribution(gen);
+            }
+        };
 
-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
+        std::vector> tasks;
+        tasks.reserve(n_threads);
+        for (size_t i = 0; i < n_threads; i++) {
+            size_t start =     i*nels/n_threads;
+            size_t end   = (i+1)*nels/n_threads;
+            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
         }
-    };
-
-    std::vector threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start =     i*size/n_threads;
-        size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-
-#if 0
-    const char * val_str = getenv("GGML_TEST_EPS");
-    float val = 1e-9f;
-    if (val_str != nullptr) {
-        val = std::stof(val_str);
-        printf("GGML_TEST_EPS=%e\n", val);
-    }
-
-    // test quantization with very small values that may result in nan scales due to division by zero
-    if (ggml_is_quantized(tensor->type)) {
-        for (int i = 0; i < 256; i++) {
-            data[i] = val;
+        for (auto & t : tasks) {
+            t.get();
         }
     }
-#endif
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector dataq(ggml_row_size(tensor->type, size));
-        std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
+
+        // dummy importance matrix
+        std::vector imatrix(tensor->ne[0], 1.0f);
         const float * im = imatrix.data();
         if (!ggml_quantize_requires_imatrix(tensor->type)) {
             // when the imatrix is optional, we want to test both quantization with and without imatrix
@@ -98,19 +87,40 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }
 
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        // TODO: other cases
-        //#pragma omp parallel for
-        //for (int i = 0; i < tensor->ne[1]; i++) {
-        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
-        //        i * tensor->ne[0], 1, tensor->ne[0], im);
-        //}
+        std::vector dataq(ggml_row_size(tensor->type, nels));
+        {
+            // parallel quantization by block
+            size_t blck_size = ggml_blck_size(tensor->type);
+            size_t n_blocks = nels / blck_size;
 
+            auto quantize_thread = [&](size_t start, size_t end) {
+                ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+                    start * blck_size, end - start, blck_size, im);
+            };
+
+            const size_t min_blocks_per_thread = 1;
+            const size_t n_threads = std::min(std::thread::hardware_concurrency()/2,
+                                                      std::max(1, n_blocks / min_blocks_per_thread));
+            std::vector> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start =     i*n_blocks/n_threads;
+                size_t end   = (i+1)*n_blocks/n_threads;
+                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_I64) {
+        // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
+        const size_t nbytes_half = ggml_nbytes(tensor)/2;
+        ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
+        ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
     } else {
         GGML_ABORT("fatal error");
     }
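The switch from raw std::thread to std::async is the same fan-out/join pattern in both the initialization and the quantization blocks. A standalone sketch of that pattern (fixed per-worker seeds here are an illustrative choice; the test uses std::random_device):

```cpp
#include <cstdint>
#include <future>
#include <random>
#include <thread>
#include <vector>

// Fill `data` with uniform values in [lo, hi), one disjoint slice per worker.
static void fill_uniform(std::vector<float> & data, float lo, float hi) {
    const size_t n_threads = std::max<size_t>(1, std::thread::hardware_concurrency());
    std::vector<std::future<void>> tasks;
    tasks.reserve(n_threads);
    for (size_t i = 0; i < n_threads; i++) {
        const size_t start =     i*data.size()/n_threads;
        const size_t end   = (i+1)*data.size()/n_threads;
        tasks.push_back(std::async(std::launch::async, [&data, lo, hi, i, start, end]() {
            std::mt19937 gen(1234u + (uint32_t) i); // one RNG per worker, no sharing
            std::uniform_real_distribution<float> dist(lo, hi);
            for (size_t j = start; j < end; j++) {
                data[j] = dist(gen);
            }
        }));
    }
    for (auto & t : tasks) {
        t.get(); // joins and rethrows any worker exception
    }
}
```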
@@ -123,7 +133,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
     std::vector buf(ggml_nbytes(t));
     ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
 
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+    const auto * tt = ggml_get_type_traits(t->type);
     size_t bs = ggml_blck_size(t->type);
     std::vector vq(ggml_blck_size(t->type));
     bool quantized = ggml_is_quantized(t->type);
@@ -140,6 +150,8 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
                         tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
+                    } else if (t->type == GGML_TYPE_I64) {
+                        tv.push_back((float)*(int64_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {
                         tv.push_back((float)*(int32_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I16) {
@@ -147,7 +159,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
                     } else if (t->type == GGML_TYPE_I8) {
                         tv.push_back((float)*(int8_t *) &buf[i]);
                     } else if (quantized) {
-                        tt.to_float(&buf[i], vq.data(), bs);
+                        tt->to_float(&buf[i], vq.data(), bs);
                         tv.insert(tv.end(), vq.begin(), vq.end());
                     } else {
                         GGML_ABORT("fatal error");
@@ -160,60 +172,6 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
     return tv;
 }
 
-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot  += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
 // normalized mean squared error = mse(a, b) / mse(a, 0)
 static double nmse(const float * a, const float * b, size_t n) {
     double mse_a_b = 0.0;
@@ -264,7 +222,6 @@ static double mean_abs_asymm(const float * a, const float * b, const size_t n, c
 }
 
 // utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 template
 static std::string var_to_str(const T & x) {
@@ -297,10 +254,6 @@ static std::string var_to_str(const std::array & x) {
     return s;
 }
 
-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
 static std::string var_to_str(ggml_type type) {
     return ggml_type_name(type);
 }
@@ -313,6 +266,8 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }
 
+#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
 #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
 #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
@@ -370,13 +325,13 @@ struct test_case {
         return 1e-4;
     }
 
-    virtual float grad_eps(){
+    virtual float grad_eps() {
         return 1e-1f;
     }
 
     // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
     // If true,  estimate gradient with 4 points, neglects 5th order derivative and higher.
-    virtual bool grad_precise(){
+    virtual bool grad_precise() {
         return false;
     }
 
@@ -409,6 +364,11 @@ struct test_case {
         return size;
     }
 
+    virtual uint64_t op_flops(ggml_tensor * t) {
+        GGML_UNUSED(t);
+        return 0;
+    }
+
     ggml_cgraph * gf = nullptr;
     ggml_cgraph * gb = nullptr;
 
@@ -651,12 +611,11 @@ struct test_case {
         }
 
         // align while also leaving some margin for variations in parameters
-        int align = 20;
+        int align = 8;
         int last = (len + align - 1) / align * align;
         if (last - len < 5) {
             last += align;
         }
-        last = std::max(last, 60);
         printf("%*s", last - len, "");
 
         // allocate
@@ -677,9 +636,25 @@ struct test_case {
         // warmup run
         ggml_backend_graph_compute(backend, gf);
 
+        // determine number of runs
+        int n_runs;
+        if (op_flops(out) > 0) {
+            // based on flops
+            const uint64_t GFLOP = 1000 * 1000 * 1000;
+            const uint64_t target_flops_cpu =   8ULL * GFLOP;
+            const uint64_t target_flops_gpu = 100ULL * GFLOP;
+            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
+        } else {
+            // based on memory size
+            const size_t GB = 1ULL << 30;
+            const size_t target_size_cpu =  8 * GB;
+            const size_t target_size_gpu = 32 * GB;
+            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
+        }
+
         // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
             ggml_graph_add_node(gf, out);
         }
@@ -704,19 +679,46 @@ struct test_case {
         }
 
         // run
-        ggml_backend_synchronize(backend);
+        int64_t total_time_us = 0;
+        int total_runs = 0;
+        do {
+            int64_t start_time = ggml_time_us();
+            ggml_backend_graph_compute(backend, gf);
+            int64_t end_time = ggml_time_us();
 
-        int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
+            total_time_us += end_time - start_time;
+            total_runs += n_runs;
+        } while (total_time_us < 1000*1000); // run for at least 1 second
 
-        printf("    %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
+        printf("    %8d runs - %8.2f us/run - ",
+            total_runs,
+            (double)total_time_us / total_runs);
+
+        if (op_flops(out) > 0) {
+            double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
+            auto format_flops = [](double flops) -> std::string {
+                char buf[256];
+                if (flops >= 1e12) {
+                    snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
+                } else if (flops >= 1e9) {
+                    snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
+                } else if (flops >= 1e6) {
+                    snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
+                }
+                return buf;
+            };
+            printf("%s/run - \033[1;34m%sS\033[0m",
+                format_flops(op_flops(out)).c_str(),
+                format_flops(flops_per_sec).c_str());
+
+        } else {
+            printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
+                op_size(out) / 1024,
+                mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+        }
+        printf("\n");
 
         ggml_backend_buffer_free(buf);
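A worked example of the new throughput math, using the 4096 x 14336 matmul with batch size 512 that the perf suite below adds; the run count and timing are made-up numbers for illustration.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // 2*m*n*k floating-point operations for an m x k by k x n matmul
    const uint64_t flops_per_run = 2ULL * 4096 * 512 * 14336; // ~60.13 GFLOP
    const int      total_runs    = 250;                       // assumed
    const double   total_time_s  = 1.5;                       // assumed
    const double   flops_per_sec = (double) flops_per_run * total_runs / total_time_s;
    printf("%.2f GFLOP/run - %.2f TFLOPS\n", flops_per_run / 1e9, flops_per_sec / 1e12);
    return 0;
}
```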
 
@@ -742,7 +744,7 @@ struct test_case {
 
         ggml_tensor * out = build_graph(ctx);
 
-        if (op_name != nullptr && op_desc(out) != op_name) {
+        if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
             //printf("  %s: skipping\n", op_desc(out).c_str());
             ggml_free(ctx);
             return true;
@@ -751,11 +753,6 @@ struct test_case {
         printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
         fflush(stdout);
 
-        if (out->grad == nullptr) {
-            printf("backwards pass not supported \n");
-            ggml_free(ctx);
-            return true;
-        }
         if (out->type != GGML_TYPE_F32) {
             ggml_free(ctx);
             printf("not supported [%s->type != FP32]\n", out->name);
@@ -764,18 +761,26 @@ struct test_case {
 
         // check if the backend supports the ops
         bool supported = true;
+        bool any_params = false;
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (!ggml_backend_supports_op(backend, t)) {
                 printf("not supported [%s] ", ggml_backend_name(backend));
                 supported = false;
                 break;
             }
-            if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
-                printf("not supported [%s->type != FP32] ", t->name);
-                supported = false;
-                break;
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                any_params = true;
+                if (t->type != GGML_TYPE_F32) {
+                    printf("not supported [%s->type != FP32] ", t->name);
+                    supported = false;
+                    break;
+                }
             }
         }
+        if (!any_params) {
+            printf("not supported [%s] \n", op_name);
+            supported = false;
+        }
         if (!supported) {
             printf("\n");
             ggml_free(ctx);
@@ -803,7 +808,7 @@ struct test_case {
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false, false);
+        ggml_build_backward_expand(ctx, gf, gb, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -986,7 +991,7 @@ struct test_example : public test_case {
     }
     // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
     // immediately after you create the tensors.
-    // This is optional and only makes sense if a backwards pass has actually been implemented for the new op.
+    // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
 };
 
 
@@ -1118,6 +1123,71 @@ struct test_get_rows : public test_case {
     }
 };
 
+// GGML_OP_ARGMAX
+struct test_argmax : public test_case {
+    const ggml_type type;
+    const std::array ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_argmax(ggml_type type = GGML_TYPE_F32,
+            std::array ne = {10, 100, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_argmax(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
+// GGML_OP_COUNT_EQUAL
+struct test_count_equal : public test_case {
+    const ggml_type type;
+    const std::array ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_count_equal(ggml_type type = GGML_TYPE_F32,
+            std::array ne = {4, 500, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * a_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(a_argmax, "a_argmax");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        ggml_tensor * b_argmax = ggml_argmax(ctx, b);
+        ggml_set_name(b_argmax, "b_argmax");
+
+        ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
 // GGML_OP_REPEAT
 struct test_repeat : public test_case {
     const ggml_type type;
@@ -1225,7 +1295,7 @@ struct test_set : public test_case {
             offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
         }
         ggml_tensor * out = ggml_set(ctx, dst, src,
-            // The backwards pass requires setting a contiguous region:
+            // The backward pass requires setting a contiguous region:
             src->nb[1], src->nb[2], src->nb[3], offset);
         ggml_set_name(out, "out");
 
@@ -1337,7 +1407,7 @@ struct test_bin_bcast : public test_case {
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(b, "b");
 
-        // The backwards pass supports broadcasting only for GGML_ADD:
+        // The backward pass supports broadcasting only for GGML_ADD:
         const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
         if (grad_supported) {
             ggml_set_param(ctx, a);
@@ -1591,13 +1661,9 @@ struct test_mul_mat : public test_case {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
     }
 
     test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -1641,13 +1707,9 @@ struct test_mul_mat_id : public test_case {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n;
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * k * n * n_used;
     }
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -1840,7 +1902,7 @@ struct test_log : public test_case {
 
     void initialize_tensors(ggml_context * ctx) override {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            // log(1) == 0, cluster values there to keep the sum low for better precision in the backwards pass:
+            // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
             init_tensor_uniform(t, 0.9f, 1.1f);
         }
     }
@@ -2758,7 +2820,10 @@ struct test_opt_step_adamw : public test_case {
         ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
+        ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
         ggml_set_name(out, "out");
 
         return out;
@@ -3163,47 +3228,46 @@ struct test_falcon : public test_llm {
 // ###########################################
 // ## Section 3: GGML Op Test Instantiation ##
 // ###########################################
+static const ggml_type all_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+    GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+};
 
+static const ggml_type base_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_K,
+    GGML_TYPE_IQ2_XXS
+};
 
-static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
+static const ggml_type other_types[] = {
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+    GGML_TYPE_BF16,
+};
+
+// Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
+static std::vector> make_test_cases_eval() {
     std::vector> test_cases;
     std::default_random_engine rng(0);
 
-    const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
-        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
-        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
-    };
-
-    const ggml_type base_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
-        GGML_TYPE_Q4_0,
-        GGML_TYPE_Q4_K,
-        GGML_TYPE_IQ2_XXS
-    };
-
-    const ggml_type other_types[] = {
-        GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
-        GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
-        GGML_TYPE_BF16,
-    };
-
     // unary ops
     for (int v : {0, 1}) {
         for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
@@ -3268,7 +3332,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
+    test_cases.emplace_back(new test_argmax());
+    test_cases.emplace_back(new test_count_equal());
+
+    for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
@@ -3286,8 +3353,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {1, 2, 0, 3}));
 
     for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
         test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
@@ -3392,6 +3459,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
         }
     }
+    for (ggml_type type_a : other_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32}) {
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
+        }
+    }
 #else
     // m = a rows
     // n = b rows
@@ -3411,15 +3486,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 #endif
 
-    for (ggml_type type_a : other_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32}) {
-            if (ggml_blck_size(type_a) != 256) {
-                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
-            }
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
-        }
-    }
-
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,  128, { 8,  1}, {1, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,  128, { 8,  1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,   64, { 8,  1}, {4, 1}));
@@ -3624,20 +3690,30 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_falcon(2));
 #endif
 
-    // run tests
-    if (mode == MODE_GRAD) {
-        size_t n_ok = 0;
-        for (auto & test : test_cases) {
-            if (test->eval_grad(backend, op_name)) {
-                n_ok++;
+    return test_cases;
+}
+
+// Test cases for performance evaluation: should be representative of real-world use cases
+static std::vector> make_test_cases_perf() {
+    std::vector> test_cases;
+
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1,   1, 1, 1}));
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+    for (int bs : {1, 512}) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1,  1}, {1, 1}));
             }
         }
-        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
-
-        return n_ok == test_cases.size();
     }
 
+    return test_cases;
+}
+
+static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
+        auto test_cases = make_test_cases_eval();
         ggml_backend_t backend_cpu = ggml_backend_cpu_init();
 
         size_t n_ok = 0;
@@ -3653,7 +3729,21 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return n_ok == test_cases.size();
     }
 
+    if (mode == MODE_GRAD) {
+        auto test_cases = make_test_cases_eval();
+        size_t n_ok = 0;
+        for (auto & test : test_cases) {
+            if (test->eval_grad(backend, op_name)) {
+                n_ok++;
+            }
+        }
+        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
+
+        return n_ok == test_cases.size();
+    }
+
     if (mode == MODE_PERF) {
+        auto test_cases = make_test_cases_perf();
         for (auto & test : test_cases) {
             test->eval_perf(backend, op_name);
         }
@@ -3667,9 +3757,9 @@ static void usage(char ** argv) {
     printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
     printf("    valid modes:\n");
     printf("      - test (default, compare with CPU backend for correctness)\n");
-    printf("      - perf (performance evaluation)\n");
     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
-    printf("    op names are as given by ggml_op_desc() (e.g. GGML_ADD)\n");
+    printf("      - perf (performance evaluation)\n");
+    printf("    op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
 }
 
 int main(int argc, char ** argv) {
@@ -3705,20 +3795,22 @@ int main(int argc, char ** argv) {
     }
 
     // enumerate backends
-    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
+    printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
 
-    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
-        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 
-        if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
+        printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
+
+        if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
             printf("  Skipping\n");
             n_ok++;
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
         GGML_ASSERT(backend != NULL);
 
         if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
@@ -3728,7 +3820,18 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        printf("  Backend name: %s\n", ggml_backend_name(backend));
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            // TODO: better value for n_threads
+            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+        }
+
+        printf("  Device description: %s\n", ggml_backend_dev_description(dev));
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+        printf("\n");
 
         bool ok = test_backend(backend, mode, op_name_filter);
 
@@ -3745,9 +3848,9 @@ int main(int argc, char ** argv) {
         ggml_backend_free(backend);
     }
 
-    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
+    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
 
-    if (n_ok != ggml_backend_reg_get_count()) {
+    if (n_ok != ggml_backend_dev_count()) {
         printf("\033[1;31mFAIL\033[0m\n");
         return 1;
     }
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 999681152..406f3884f 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -254,6 +254,11 @@ static void test_legacy_templates() {
             "DeepSeek-V2",
             "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
             u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant:    I am an assistant   <|end▁of▁sentence|>User: Another question\n\nAssistant:",
+        },
+        {
+            "RWKV-World",
+            "{% for message in messages %}{% if message['role'] == 'user' %}{{'User: ' + message['content'] + '\n\nAssistant:'}}{% else %}{{message['content'] + '\n\n'}}{% endif %}{% endfor %}",
+            "You are a helpful assistant\n\nUser: Hello\n\nAssistant:Hi there\n\nUser: Who are you\n\nAssistant:   I am an assistant   \n\nUser: Another question\n\nAssistant:",
         }
     };
 
@@ -299,11 +304,11 @@ static void test_legacy_templates() {
 
     // test llama_chat_format_single for system message
     printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
-    std::vector chat2;
-    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+    std::vector chat2;
+    common_chat_msg sys_msg{"system", "You are a helpful assistant"};
 
     auto fmt_sys = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
         printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
@@ -319,10 +324,10 @@ static void test_legacy_templates() {
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
-    llama_chat_msg new_msg{"user", "How are you"};
+    common_chat_msg new_msg{"user", "How are you"};
 
     auto fmt_single = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
         printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 2ef606d2c..2200ad93d 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -240,12 +240,14 @@ static bool check_gradient(
     struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
     ggml_build_forward_expand(gf, f);
     ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false, false);
+    ggml_build_backward_expand(ctx0, gf, gb, false);
 
     ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
+    ggml_graph_reset(gb);
+    if (f->grad) {
+        ggml_set_f32(f->grad, 1.0f);
+    }
 
     ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
@@ -298,8 +300,10 @@ static bool check_gradient(
             ggml_set_f32_1d(x[i], k, x0);
 
             // compute gradient using backward graph
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
+            ggml_graph_reset(gb);
+            if (f->grad) {
+                ggml_set_f32(f->grad, 1.0f);
+            }
 
             ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
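Condensed from the fragments above, the updated forward/backward setup looks roughly like this; the parameter name "accumulate" for the remaining bool is an assumption, and f->grad may legitimately be null after the graph changes.

```cpp
#include "ggml.h"

static void forward_and_backward(ggml_context * ctx0, ggml_tensor * f, int n_threads) {
    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, /*grads=*/true);
    ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, /*grads=*/true);

    ggml_build_forward_expand(gf, f);
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx0, gf, gb, /*accumulate=*/false); // the second bool was dropped

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset(gb);      // reset the backward graph, not the forward one
    if (f->grad) {             // may be null for ops without gradients
        ggml_set_f32(f->grad, 1.0f);
    }
    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
}
```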
 
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 3a89598a8..9d2db91f5 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function tmp_q(2*test_size);
     std::vector tmp_out(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
     std::vector tmp_q(2*test_size);
     std::vector tmp_out(test_size);
     std::vector tmp_out_ref(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+    const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
 ) {
     std::vector tmp_q1(2*test_size);
     std::vector tmp_q2(2*test_size);
 
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+    const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
 
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
 
         // deprecated - skip
-        if (qfns.blck_size == 0) {
+        if (qfns->blck_size == 0) {
             continue;
         }
 
@@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
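A hedged sketch of the pointer-based traits API these tests now use: look up the traits for a type, check the optional callbacks, then round-trip a float buffer. The choice of GGML_TYPE_Q4_0 and the max-error metric are illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>
#include "ggml.h"

// `n` must be a multiple of the type's block size (32 for Q4_0).
static float quantize_roundtrip_max_err(const float * src, size_t n) {
    const ggml_type type = GGML_TYPE_Q4_0;
    const ggml_type_traits * qfns = ggml_get_type_traits(type);
    if (!qfns->from_float || !qfns->to_float) {
        return 0.0f; // type cannot be round-tripped
    }
    ggml_quantize_init(type); // required for some types, harmless otherwise

    std::vector<uint8_t> q(ggml_row_size(type, n));
    std::vector<float>   out(n);
    qfns->from_float(src, q.data(), n);
    qfns->to_float(q.data(), out.data(), n);

    float max_err = 0.0f;
    for (size_t i = 0; i < n; i++) {
        max_err = std::max(max_err, std::fabs(out[i] - src[i]));
    }
    return max_err;
}
```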
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 24e066053..bdbdd90a8 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -122,9 +122,9 @@ static void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
+            if (qfns->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float_ref(test_data1, test_q1, size);
+                        qfns->from_float_ref(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float(test_data1, test_q1, size);
+                        qfns->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.to_float(test_q1, test_out, size);
+                        qfns->to_float(test_q1, test_out, size);
                         return test_out[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                        vdot.from_float(test_data1, test_q1, size);
+                        const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+                        vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                qfns.from_float(test_data2, test_q2, largest);
+                qfns->from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
                         float result;
-                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                         return result;
                     };
                     size_t quantized_size = ggml_row_size(type, size);
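
The hunks above switch both quantization tests from the by-value ggml_internal_get_type_traits() to the pointer-returning ggml_get_type_traits(). The following is a minimal sketch of the consuming pattern, not part of the patch, using only symbols that appear in the hunks (GGML_TYPE_COUNT, ggml_type_name, blck_size, from_float, to_float):

// minimal sketch (not part of the patch): pointer-based type-traits lookup
#include "ggml.h"

#include <cstdio>

static void list_convertible_types(void) {
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        const ggml_type type = (ggml_type) i;
        const auto * qfns = ggml_get_type_traits(type); // pointer, no struct copy

        if (qfns->blck_size == 0) {
            continue; // deprecated type, same skip as in the tests above
        }
        if (qfns->from_float && qfns->to_float) {
            printf("%s supports quantize + dequantize\n", ggml_type_name(type));
        }
    }
}
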
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6e021c4c7..05600e6f5 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -18,181 +18,176 @@ static void dump(const llama_token_data_array * cur_p) {
 
 #define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
 
-#define APPLY(__cnstr, __cur_p) do { \
-    auto * cnstr = (__cnstr); \
-    llama_sampler_apply(cnstr, (__cur_p)); \
-    llama_sampler_free(cnstr); \
-} while(0)
+struct sampler_tester {
+    sampler_tester(size_t n_vocab) {
+        cur.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+            const float logit = logf(token_id);
+            cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        }
 
-static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
-    const size_t n_vocab = probs.size();
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
+        cur.reserve(probs.size());
+        for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
+            const float logit = logf(probs[token_id]);
+            cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    void apply(llama_sampler * sampler) {
+        llama_sampler_apply(sampler, &cur_p);
+        llama_sampler_free(sampler);
+    }
+
+    void check() {
+        GGML_ASSERT(cur_p.size == probs_expected.size());
+        for (size_t i = 0; i < cur_p.size; i++) {
+            GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
+        }
+    }
+
+    llama_token_data_array cur_p;
+
+private:
+    const std::vector<float> probs_expected;
 
     std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+};
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_k(k), &cur_p);
-    DUMP(&cur_p);
+static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
+    sampler_tester tester(probs, probs_expected);
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp(temp));
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
-static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
-    const size_t n_vocab = probs.size();
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_k(k));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_tfs(const std::vector<float> & probs, const std::vector<float> & probs_expected, float z) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_tail_free(z, 1));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
-    DUMP(&cur_p);
+    tester.check();
+}
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_min_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_typical(p, 1));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
 static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
+    const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
-    GGML_ASSERT(probs.size() == expected_probs.size());
+    GGML_ASSERT(probs.size() == probs_expected.size());
+
+    sampler_tester tester(probs, probs_expected);
 
     const size_t n_vocab = probs.size();
-
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-
     auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
         llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(sampler, &cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(token_id);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    sampler_tester tester(n_vocab);
 
           llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
+            case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
             case 'f': GGML_ABORT("tail_free test not implemented");
             case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
-            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
+            case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
+            case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
             case 't': GGML_ABORT("temperature test not implemented");
             default : GGML_ABORT("Unknown sampler");
         }
 
-        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
+        tester.apply(llama_sampler_init_dist(0));
+
+        auto & cur_p = tester.cur_p;
 
         const int size = cur_p.size;
 
@@ -263,7 +258,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
     }
     const int64_t t_end = ggml_time_us();
     llama_sampler_free(cnstr);
-    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
 }
 
 #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@@ -279,26 +274,32 @@ static void test_perf() {
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 
-    BENCH(llama_sampler_init_top_k    (40),      data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_softmax  (),        data, 32);
+    BENCH(llama_sampler_init_top_k    (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p    (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p    (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
 }
 
 int main(void) {
     ggml_time_init();
 
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
 
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
 
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
@@ -309,6 +310,14 @@ int main(void) {
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
 
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.1f},                                0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.2f, 0.1f},                          0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.3f, 0.2f, 0.1f},                    0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
+
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
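
The sampler_tester struct introduced above replaces the removed APPLY macro with a single apply-then-free helper. A rough illustration of that sequence follows; it is not part of the patch and uses only the llama.h sampler calls already shown in the hunks:

// rough illustration (not part of the patch): what sampler_tester::apply() does
#include "llama.h"

#include <vector>

static void apply_and_free(std::vector<llama_token_data> & cur, llama_sampler * smpl) {
    llama_token_data_array cur_p = { cur.data(), cur.size(), /*selected*/ -1, /*sorted*/ false };

    llama_sampler_apply(smpl, &cur_p); // mutates cur_p in place (size, order, probabilities)
    llama_sampler_free(smpl);          // the tester frees after every apply
}

// usage, mirroring the tests: apply_and_free(cur, llama_sampler_init_top_k(3));
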
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index d3d21331b..0af85f002 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -7,6 +7,7 @@
 #include <map>
 #include <vector>
 #include <fstream>
+#include <thread>
 
 //static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 //    static std::map<std::string, std::vector<llama_token>> _k_tests = {
@@ -194,45 +195,64 @@ int main(int argc, char **argv) {
 
     const bool add_special = false;
 
-    for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
 
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
 
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens:      ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
             }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
+        });
     }
 
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
+    }
+
+    // single threaded tokenization
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
@@ -253,7 +273,7 @@ int main(int argc, char **argv) {
         {
             const auto t_start = ggml_time_us();
 
-            res = llama_tokenize(ctx, text, add_special, false);
+            res = common_tokenize(ctx, text, add_special, false);
 
             const auto t_end = ggml_time_us();
 
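
The multi-threaded section added above fans the same test set out over hardware_concurrency() threads so the tokenizer is exercised concurrently, with only thread 0 printing. A condensed sketch of that fan-out follows; it is not part of the patch and assumes only the common_tokenize() helper referenced in the hunks:

// condensed sketch (not part of the patch): thread fan-out over one tokenizer
#include "common.h"

#include <string>
#include <thread>
#include <vector>

static void tokenize_on_all_threads(llama_context * ctx, const std::string & text) {
    const int nthread = std::thread::hardware_concurrency();
    std::vector<std::thread> threads(nthread);

    for (int i = 0; i < nthread; i++) {
        threads[i] = std::thread([&]() {
            // every thread tokenizes the same input; a real test would print or
            // compare results on thread 0 only, as done above
            const std::vector<llama_token> res = common_tokenize(ctx, text, /*add_special*/ false, /*parse_special*/ false);
            (void) res;
        });
    }

    for (int i = 0; i < nthread; i++) {
        threads[i].join();
    }
}
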
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 9498387e0..0ff7fc833 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -78,10 +78,10 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
             if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                         "%s : error: token %d detokenizes to '%s'(%zu) but "
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
                 fprintf(stderr, "]\n");
                 return 2;
             }
-            std::string check = llama_detokenize(ctx, tokens);
+            std::string check = common_detokenize(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                     __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -123,8 +123,8 @@ int main(int argc, char **argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());
diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp
index 7ca9e2ca6..9b0716a43 100644
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());